From 0a053627276e25066082b1cd8cc66103513b4fca Mon Sep 17 00:00:00 2001
From: Shuming Hu
Date: Fri, 25 Apr 2025 23:23:56 +0000
Subject: [PATCH 01/65] plm template

---
 docs/source/en/_toctree.yml                   |   2 +
 docs/source/en/model_doc/perception_lm.md     |  59 ++
 .../models/auto/configuration_auto.py         |   2 +
 src/transformers/models/auto/modeling_auto.py |   3 +
 .../models/auto/processing_auto.py            |   1 +
 .../models/auto/tokenization_auto.py          |   6 +
 .../models/perception_lm/__init__.py          |  29 +
 .../configuration_perception_lm.py            | 139 ++++
 .../convert_perception_lm_weights_to_hf.py    | 204 ++++++
 .../image_processing_perception_lm.py         | 435 ++++++++++++
 .../image_processing_perception_lm_fast.py    | 201 ++++++
 .../perception_lm/modeling_perception_lm.py   | 501 ++++++++++++++
 .../perception_lm/processing_perception_lm.py | 202 ++++++
 tests/models/perception_lm/__init__.py        |   0
 .../test_image_processing_perception_lm.py    | 236 +++++++
 .../test_modeling_perception_lm.py            | 625 ++++++++++++++++++
 .../test_processor_perception_lm.py           | 108 +++
 17 files changed, 2753 insertions(+)
 create mode 100644 docs/source/en/model_doc/perception_lm.md
 create mode 100644 src/transformers/models/perception_lm/__init__.py
 create mode 100644 src/transformers/models/perception_lm/configuration_perception_lm.py
 create mode 100644 src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py
 create mode 100644 src/transformers/models/perception_lm/image_processing_perception_lm.py
 create mode 100644 src/transformers/models/perception_lm/image_processing_perception_lm_fast.py
 create mode 100644 src/transformers/models/perception_lm/modeling_perception_lm.py
 create mode 100644 src/transformers/models/perception_lm/processing_perception_lm.py
 create mode 100644 tests/models/perception_lm/__init__.py
 create mode 100644 tests/models/perception_lm/test_image_processing_perception_lm.py
 create mode 100644 tests/models/perception_lm/test_modeling_perception_lm.py
 create mode 100644 tests/models/perception_lm/test_processor_perception_lm.py

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 0e5248d8980c..0aa143448c32 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -1035,6 +1035,8 @@
       title: PaliGemma
     - local: model_doc/perceiver
       title: Perceiver
+    - local: model_doc/perception_lm
+      title: PerceptionLM
     - local: model_doc/phi4_multimodal
       title: Phi4 Multimodal
     - local: model_doc/pix2struct
diff --git a/docs/source/en/model_doc/perception_lm.md b/docs/source/en/model_doc/perception_lm.md
new file mode 100644
index 000000000000..b5875f9228a0
--- /dev/null
+++ b/docs/source/en/model_doc/perception_lm.md
@@ -0,0 +1,59 @@
+
+
+# PerceptionLM
+
+## Overview
+
+The PerceptionLM model was proposed in []() by .
+
+
+The abstract from the paper is the following:
+
+**
+
+Tips:
+
+
+
+This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/).
+The original code can be found [here]().
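+
+## Usage example
+
+The snippet below is a minimal inference sketch. It assumes the `facebook/Perception-LM-1B` checkpoint name used in the
+modeling docstrings of this model and reuses the LLaVA-style prompt from the `forward` example; checkpoint ids, the chat
+template, and the prompt format may differ in the released models.
+
+```python
+from PIL import Image
+import requests
+from transformers import AutoProcessor, PerceptionLMForConditionalGeneration
+
+# Checkpoint id is the one referenced elsewhere in this model's docstrings (assumed, not yet released).
+model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B")
+processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B")
+
+# LLaVA-style prompt with an image placeholder token; adjust if the final processor uses a chat template.
+prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(images=image, text=prompt, return_tensors="pt")
+generate_ids = model.generate(**inputs, max_new_tokens=30)
+print(processor.batch_decode(generate_ids, skip_special_tokens=True)[0])
+```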
+ + +## PerceptionLMConfig + +[[autodoc]] PerceptionLMConfig + +## PerceptionLMProcessor + +[[autodoc]] PerceptionLMProcessor + +## PerceptionLMForConditionalGeneration + +[[autodoc]] PerceptionLMForConditionalGeneration + - forward diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d7bf78fefe87..637b4f093512 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -203,6 +203,7 @@ ("llama4", "Llama4Config"), ("llama4_text", "Llama4TextConfig"), ("llava", "LlavaConfig"), + ("perception_lm", "PerceptionLMConfig"), ("llava_next", "LlavaNextConfig"), ("llava_next_video", "LlavaNextVideoConfig"), ("llava_onevision", "LlavaOnevisionConfig"), @@ -591,6 +592,7 @@ ("llama4", "Llama4"), ("llama4_text", "Llama4ForCausalLM"), ("llava", "LLaVa"), + ("perception_lm", "PerceptionLM"), ("llava_next", "LLaVA-NeXT"), ("llava_next_video", "LLaVa-NeXT-Video"), ("llava_onevision", "LLaVA-Onevision"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 075e8e31f15b..d795f1473968 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -408,6 +408,7 @@ ("janus", "JanusForConditionalGeneration"), ("layoutlm", "LayoutLMForMaskedLM"), ("llava", "LlavaForConditionalGeneration"), + ("perception_lm", "PerceptionLMForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), @@ -883,6 +884,7 @@ ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), + ("perception_lm", "PerceptionLMForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), @@ -927,6 +929,7 @@ ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llama4", "Llama4ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), + ("perception_lm", "PerceptionLMForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index e5bd673f6390..151b0b66a3c7 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -87,6 +87,7 @@ ("layoutlmv3", "LayoutLMv3Processor"), ("llama4", "Llama4Processor"), ("llava", "LlavaProcessor"), + ("perception_lm", "PerceptionLMProcessor"), ("llava_next", "LlavaNextProcessor"), ("llava_next_video", "LlavaNextVideoProcessor"), ("llava_onevision", "LlavaOnevisionProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index c8656a710746..ebbc3ad2c3f2 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -464,6 +464,12 @@ None, ), ), + ( + "perception_lm", + ( + "PerceptionLMTokenizer", "PerceptionLMTokenizerFast" if is_tokenizers_available() else None + ) + ), ( "persimmon", ( diff --git 
a/src/transformers/models/perception_lm/__init__.py b/src/transformers/models/perception_lm/__init__.py new file mode 100644 index 000000000000..81c3ba93bcf4 --- /dev/null +++ b/src/transformers/models/perception_lm/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_perception_lm import * + from .image_processing_perception_lm_fast import * + from .modeling_perception_lm import * + from .processing_perception_lm import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py new file mode 100644 index 000000000000..79b3cce8549f --- /dev/null +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -0,0 +1,139 @@ +# coding=utf-8 +# Copyright 2025 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PerceptionLM model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class PerceptionLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PerceptionLMForConditionalGeneration`]. It is used to instantiate an + PerceptionLM model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the PerceptionLM-9B. + + e.g. [perception_lm-hf/perception_lm-9b](https://huggingface.co/perception_lm-hf/perception_lm-9b) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. 
+ image_token_index (`int`, *optional*, defaults to 32000): + The image token index to encode the image prompt. + projector_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The activation function used by the multimodal projector. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + image_seq_length (`int`, *optional*, defaults to 576): + Sequence length of one image embedding. + multimodal_projector_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias in the multimodal projector. + + Example: + + ```python + >>> from transformers import PerceptionLMForConditionalGeneration, PerceptionLMConfig, CLIPVisionConfig, LlamaConfig + + >>> # Initializing a CLIP-vision config + >>> vision_config = CLIPVisionConfig() + + >>> # Initializing a Llama config + >>> text_config = LlamaConfig() + + >>> # Initializing a PerceptionLM perception_lm-1.5-7b style configuration + >>> configuration = PerceptionLMConfig(vision_config, text_config) + + >>> # Initializing a model from the perception_lm-1.5-7b style configuration + >>> model = PerceptionLMForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "perception_lm" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + image_token_index=32000, + projector_hidden_act="gelu", + vision_feature_select_strategy="default", + vision_feature_layer=-2, + image_seq_length=576, + multimodal_projector_bias=True, + **kwargs, + ): + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.image_seq_length = image_seq_length + + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError( + "vision_feature_select_strategy should be one of 'default', 'full'." 
+ f"Got: {vision_feature_select_strategy}" + ) + + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + + if isinstance(vision_config, dict): + vision_config["model_type"] = ( + vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model" + ) + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["clip_vision_model"]( + intermediate_size=4096, + hidden_size=1024, + patch_size=14, + image_size=336, + num_hidden_layers=24, + num_attention_heads=16, + vocab_size=32000, + projection_dim=768, + ) + + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama" + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["llama"]() + + self.text_config = text_config + self.multimodal_projector_bias = multimodal_projector_bias + + super().__init__(**kwargs) + + +__all__ = ["PerceptionLMConfig"] diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py new file mode 100644 index 000000000000..cde3af02d8b8 --- /dev/null +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -0,0 +1,204 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
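+
+# Usage sketch (all repo ids below are placeholders; see the argparse flags defined in main() at the bottom of this file):
+#
+#   python src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py \
+#       --text_model_id <hub id of the text backbone> \
+#       --vision_model_id <hub id of the vision backbone> \
+#       --output_hub_path <hub repo to push the converted model and processor to> \
+#       --old_state_dict_id <hub repo holding the original weights, either model_state_dict.bin or safetensors>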
+import argparse +import glob + +import torch +from huggingface_hub import file_exists, hf_hub_download, snapshot_download +from safetensors import safe_open + +from transformers import ( + AddedToken, + AutoConfig, + AutoImageProcessor, + AutoTokenizer, + PerceptionLMConfig, + PerceptionLMForConditionalGeneration, + PerceptionLMProcessor, + SiglipVisionConfig, +) + + +EPILOG_TXT = """Example: + python transformers/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py --text_model_id lmsys/vicuna-7b-v1.5 --vision_model_id openai/clip-vit-large-patch14-336 --output_hub_path org/perception_lm-v1.5-7b-conv --old_state_dict_id liuhaotian/perception_lm-v1.5-7b + +Example for creating the old state dict file with Python: + + import torch + from perception_lm.model.language_model.perception_lm_llama import PerceptionLMLlamaForCausalLM + + # load model + kwargs = {"device_map": "auto", "torch_dtype": torch.float16} + model = PerceptionLMLlamaForCausalLM.from_pretrained("liuhaotian/perception_lm-v1.5-7b", low_cpu_mem_usage=True, **kwargs) + + # load vision tower + model.get_vision_tower().load_model() + + # Save state dict + torch.save(model.state_dict(), "tmp/hf_models/perception_lm-v1.5-7b/model_state_dict.bin") +""" + +KEYS_TO_MODIFY_MAPPING = { + "model.vision_tower.": "", + ".vision_resampler": "", # all lmms-lab models do avg pooling, so no vision_resampler + "model.mm_projector": "multi_modal_projector", + "model": "model.model", + "vision_model.model": "vision_model", + "lm_head": "language_model.lm_head", + "model.model": "language_model.model", + "multi_modal_projector.0": "multi_modal_projector.linear_1", + "multi_modal_projector.2": "multi_modal_projector.linear_2", +} + + +def load_original_state_dict(model_id): + directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) + + original_state_dict = {} + for path in glob.glob(f"{directory_path}/*"): + if path.endswith(".safetensors"): + with safe_open(path, framework="pt", device="cpu") as f: + for key in f.keys(): + original_state_dict[key] = f.get_tensor(key) + + # tied wieghts so lm.head is not saved. 
Let's clone to load state dict + if "lm_head.weight" not in original_state_dict: + original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone() + + if "model.image_newline" in original_state_dict: + # not used in the original implementation because "merge_type=flat" + del original_state_dict["model.image_newline"] + return original_state_dict + + +# used only for perception_lm-interlave +# for ex: Qwen/Qwen1.5-0.5B-Chat google/siglip-so400m-patch14-384 lmms-lab/perception_lm-next-interleave-qwen-0.5b +def convert_state_dict_to_hf(state_dict): + new_state_dict = {} + for key, value in state_dict.items(): + if key.endswith(".inv_freq"): + continue + for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + + new_state_dict[key] = value + return new_state_dict + + +def convert_perception_lm_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): + torch.set_default_dtype(torch.float16) + text_config = AutoConfig.from_pretrained(text_model_id) + + tokenizer = AutoTokenizer.from_pretrained(text_model_id) + tokenizer.add_tokens(AddedToken("", special=True, normalized=False), special_tokens=True) + if "Qwen" not in text_model_id: # qwen already has a pad token + tokenizer.add_special_tokens({"pad_token": ""}) + + image_processor = AutoImageProcessor.from_pretrained(vision_model_id) + processor = PerceptionLMProcessor(tokenizer=tokenizer, image_processor=image_processor) + + if "siglip" in vision_model_id: + vision_config = SiglipVisionConfig( + hidden_size=1152, + image_size=384, + intermediate_size=4304, + num_attention_heads=16, + num_hidden_layers=26, + patch_size=14, + vision_use_head=False, + ).to_dict() + else: + vision_config = None + + config = PerceptionLMConfig( + text_config=text_config, + vision_config=vision_config, + ) + + # llms-lab interleeave models do not use any selection startegy except for last hidden state + if "Qwen" in text_model_id: + config.image_token_id = 151646 + if "siglip" in vision_model_id: + config.vision_feature_select_strategy = "full" + config.vision_feature_layer = -1 + else: + config.pad_token_id = 32001 + config.image_token_id = 32000 + + with torch.device("meta"): + model = PerceptionLMForConditionalGeneration(config) + + # Some perception_lm variants like microsoft/perception_lm-med-v1.5-mistral-7b use safetensors to store weights + if file_exists(old_state_dict_id, "model_state_dict.bin"): + state_dict_path = hf_hub_download(old_state_dict_id, "model_state_dict.bin") + state_dict = torch.load(state_dict_path, map_location="cpu", weights_only=True) + else: + state_dict = load_original_state_dict(old_state_dict_id) + + state_dict = convert_state_dict_to_hf(state_dict) + model.load_state_dict(state_dict, strict=True, assign=True) + + pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data + mu = torch.mean(pre_expansion_embeddings, dim=0).float() + n = pre_expansion_embeddings.size()[0] + sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n + dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) + + # We add an image token so we resize the model and pad to 64 for performance reasons + pad_shape = 64 + vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(config.text_config.vocab_size + 2, pad_shape) + model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( + tuple( + 
(dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])) + ), + dim=0, + ) + model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( + tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), + dim=0, + ) + + model.push_to_hub(output_hub_path) + processor.push_to_hub(output_hub_path) + + +def main(): + parser = argparse.ArgumentParser( + epilog=EPILOG_TXT, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--text_model_id", + help="Hub location of the text model", + ) + parser.add_argument( + "--vision_model_id", + help="Hub location of the vision model", + ) + parser.add_argument( + "--output_hub_path", + help="Location on the hub of the converted model", + ) + parser.add_argument( + "--old_state_dict_id", + help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", + ) + args = parser.parse_args() + convert_perception_lm_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) + + +if __name__ == "__main__": + main() diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm.py b/src/transformers/models/perception_lm/image_processing_perception_lm.py new file mode 100644 index 000000000000..c36d4f1b6c9d --- /dev/null +++ b/src/transformers/models/perception_lm/image_processing_perception_lm.py @@ -0,0 +1,435 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for PerceptionLM.""" + +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + convert_to_rgb, + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_kwargs, + validate_preprocess_arguments, +) +from ...utils import TensorType, is_vision_available, logging + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +class PerceptionLMImageProcessor(BaseImageProcessor): + r""" + Constructs a PerceptionLM image processor. + + Args: + do_pad (`bool`, *optional*, defaults to `False`): + Whether to pad the image to a square based on the longest edge. + The padding value is determined by the `image_mean` parameter. + Can be overridden by `do_pad` in the `preprocess` method. + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. 
+ size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`Dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. 
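+
+    Example (a minimal sketch using the default CLIP-style values defined in `__init__`; released checkpoints may ship
+    different sizes and normalization constants):
+
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import PerceptionLMImageProcessor
+
+    >>> image_processor = PerceptionLMImageProcessor()
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+    >>> inputs = image_processor(images=image, return_tensors="pt")
+    >>> inputs.pixel_values.shape  # resized to shortest_edge=224, then center-cropped to 224x224
+    torch.Size([1, 3, 224, 224])
+    ```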
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_pad: bool = False, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_pad = do_pad + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + self._valid_processor_keys = [ + "images", + "do_pad", + "do_resize", + "size", + "resample", + "do_center_crop", + "crop_size", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_convert_rgb", + "return_tensors", + "data_format", + "input_data_format", + ] + + def pad_to_square( + self, + image: np.ndarray, + background_color: Union[int, Tuple[int, int, int]] = 0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.array: + """ + Pads an image to a square based on the longest edge. + + Args: + image (`np.ndarray`): + The image to pad. + background_color (`int` or `Tuple[int, int, int]`, *optional*, defaults to 0): + The color to use for the padding. Can be an integer for single channel or a + tuple of integers representing for multi-channel images. If passed as integer + in mutli-channel mode, it will default to `0` in subsequent channels. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. 
+ """ + height, width = get_image_size(image, input_data_format) + num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1] + + if height == width: + image = ( + to_channel_dimension_format(image, data_format, input_data_format) + if data_format is not None + else image + ) + return image + + max_dim = max(height, width) + + # Ensure background_color is the correct shape + if isinstance(background_color, int): + background_color = [background_color] + elif len(background_color) != num_channels: + raise ValueError( + f"background_color must have no more than {num_channels} elements to match the number of channels" + ) + + if input_data_format == ChannelDimension.FIRST: + result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype) + for i, color in enumerate(background_color): + result[i, :, :] = color + if width > height: + start = (max_dim - height) // 2 + result[:, start : start + height, :] = image + else: + start = (max_dim - width) // 2 + result[:, :, start : start + width] = image + else: + result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype) + for i, color in enumerate(background_color): + result[:, :, i] = color + if width > height: + start = (max_dim - height) // 2 + result[start : start + height, :, :] = image + else: + start = (max_dim - width) // 2 + result[:, start : start + width, :] = image + + image = ( + to_channel_dimension_format(result, data_format, input_data_format) if data_format is not None else result + ) + return image + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
+ """ + default_to_square = True + if "shortest_edge" in size: + size = size["shortest_edge"] + default_to_square = False + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") + + output_size = get_resize_output_image_size( + image, + size=size, + default_to_square=default_to_square, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def preprocess( + self, + images: ImageInput, + do_pad: Optional[bool] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to a square based on the longest edge. + The padding value is determined by the `image_mean` parameter. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. 
+ return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_pad = do_pad if do_pad is not None else self.do_pad + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + # we don't pass `do_pad` here since PerceptionLM uses a custom padding to a square + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." 
+ ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + processed_images = [] + for image in images: + if do_pad: + image = self.pad_to_square( + image=image, + background_color=tuple(int(x * 255) for x in self.image_mean), + input_data_format=input_data_format, + ) + + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + processed_images.append(image) + + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + +__all__ = ["PerceptionLMImageProcessor"] diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py new file mode 100644 index 000000000000..ed70ab96005c --- /dev/null +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -0,0 +1,201 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for PerceptionLM.""" + +from typing import List, Optional, Tuple, Union + +from ...image_processing_utils import ( + BatchFeature, +) +from ...image_processing_utils_fast import ( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + SizeDict, + get_image_size, +) +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + add_start_docstrings, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + is_vision_available, +) + + +if is_vision_available(): + from ...image_utils import PILImageResampling + +if is_torch_available(): + import torch + +if is_torchvision_available(): + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + + +class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + do_pad: Optional[bool] + + +@add_start_docstrings( + "Constructs a fast PerceptionLM image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + """ + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to a square based on the longest edge. 
Can be overridden by the `do_pad` parameter + """, +) +class PerceptionLMImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"shortest_edge": 224} + default_to_square = False + crop_size = {"height": 224, "width": 224} + do_pad = False + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + valid_kwargs = PerceptionLMFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None: + super().__init__(**kwargs) + + @add_start_docstrings( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, + """ + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to a square based on the longest edge. Can be overridden by the `do_pad` parameter + """, + ) + def preprocess(self, images: ImageInput, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + def pad_to_square( + self, + images: "torch.Tensor", + background_color: Union[int, Tuple[int, int, int]] = 0, + ) -> "torch.Tensor": + """ + Pads an image to a square based on the longest edge. + + Args: + images (`np.ndarray`): + The images to pad. + background_color (`int` or `Tuple[int, int, int]`, *optional*, defaults to 0): + The color to use for the padding. Can be an integer for single channel or a + tuple of integers representing for multi-channel images. If passed as integer + in mutli-channel mode, it will default to `0` in subsequent channels. + Returns: + `torch.Tensor`: The padded images. + """ + height, width = get_image_size(images, ChannelDimension.FIRST) + + if height == width: + return images + + num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0] + if isinstance(background_color, int): + background_color = [background_color] + [0] * (num_channels - 1) + elif len(background_color) != num_channels: + raise ValueError( + f"background_color must have no more than {num_channels} elements to match the number of channels" + ) + + max_dim = max(height, width) + paste_x_left = (max_dim - width) // 2 + paste_y_left = (max_dim - height) // 2 + paste_x_right = max_dim - width - paste_x_left + paste_y_right = max_dim - height - paste_y_left + padded_images = F.pad( + images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color + ) + + return padded_images + + def _preprocess( + self, + images: List["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_pad: bool, + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, List[float]]], + image_std: Optional[Union[float, List[float]]], + return_tensors: Optional[Union[str, TensorType]], + ) -> BatchFeature: + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_pad: + stacked_images = self.pad_to_square( + images=stacked_images, background_color=tuple(int(x * 255) for x in self.image_mean) + ) + resized_images_grouped[shape] = stacked_images + padded_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for batched resizing + # Needed in case do_pad is False, or padding returns images with different sizes + 
grouped_images, grouped_images_index = group_images_by_shape(padded_images) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + +__all__ = ["PerceptionLMImageProcessorFast"] diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py new file mode 100644 index 000000000000..563ac0b33eb4 --- /dev/null +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -0,0 +1,501 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch PerceptionLM model.""" + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_outputs import ModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_torchdynamo_compiling, + logging, + replace_return_docstrings, +) +from ..auto import AutoModel, AutoModelForCausalLM +from .configuration_perception_lm import PerceptionLMConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "PerceptionLMConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/Perception-LM-1B" + + +@dataclass +# Copied from transformers.models.llava.modeling_llava.LlavaCausalLMOutputWithPast with Llava->PerceptionLM +class PerceptionLMCausalLMOutputWithPast(ModelOutput): + """ + Base class for PerceptionLM causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). 
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->PerceptionLM +class PerceptionLMMultiModalProjector(nn.Module): + def __init__(self, config: PerceptionLMConfig): + super().__init__() + # We have hidden_size * the number of vision feature layers + num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * num_feature_layers, + config.text_config.hidden_size, + bias=config.multimodal_projector_bias, + ) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear( + config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + ) + + def forward(self, image_features): + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +PERCEPTION_LM_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`PerceptionLMConfig`] or [`PerceptionLMVisionConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + PERCEPTION_LM_START_DOCSTRING, +) +# Copied from transformers.models.llava.modeling_llava.LlavaPreTrainedModel with Llava->PerceptionLM,llava->perception_lm +class PerceptionLMPreTrainedModel(PreTrainedModel): + config_class = PerceptionLMConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["PerceptionLMVisionAttention"] + _skip_keys_device_placement = "past_key_values" + _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_quantized_cache = True + _supports_static_cache = True + + def _init_weights(self, module): + # important: this ported version of PerceptionLM isn't meant for training from scratch - only + # inference and fine-tuning - so the proper init weights code has been removed - the original codebase + # https://github.com/haotian-liu/LLaVA/tree/main/perception_lm should serve for that purpose + std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range) + + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + + +PERCEPTION_LM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`PerceptionLMProcessor`] uses + [`CLIPImageProcessor`] for processing images). + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + vision_feature_layer (`Union[int, List[int]], *optional*, defaults to -2`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
+""" + + +@add_start_docstrings( + """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", + PERCEPTION_LM_START_DOCSTRING, +) +# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration with LLAVA->PERCEPTION_LM,Llava->PerceptionLM,llava-hf/llava-1.5-7b-hf->facebook/Perception-LM-1B +class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): + def __init__(self, config: PerceptionLMConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + + self.multi_modal_projector = PerceptionLMMultiModalProjector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Union[int, List[int]], + vision_feature_select_strategy: str, + **kwargs, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + + kwargs = {k: v for k, v in kwargs.items() if v is not None} + # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. 
+ image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, **kwargs) + + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + else: + hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + # For default; crop CLS from each hidden state in the hidden state pool + if vision_feature_select_strategy == "default": + hs_pool = [hs[:, 1:] for hs in hs_pool] + selected_image_feature = torch.cat(hs_pool, dim=-1) + + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + + @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **lm_kwargs, + ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + + Returns: + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration + + >>> model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B") + >>> processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B") + + >>> prompt = "USER: \nWhat's the content of the image? 
ASSISTANT:" + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_new_tokens=15) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if pixel_values is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + image_sizes=image_sizes, + ) + + special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel(): + n_image_tokens = (input_ids == self.config.image_token_id).sum() + n_image_features = image_features.shape[0] * image_features.shape[1] + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **lm_kwargs, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. 
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) + shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() + else: + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device) + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return PerceptionLMCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + +__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel"] diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py new file mode 100644 index 000000000000..cb989af91830 --- /dev/null +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -0,0 +1,202 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for PerceptionLM. 
+""" + +from typing import List, Union + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class PerceptionLMProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": {}, + } + + +class PerceptionLMProcessor(ProcessorMixin): + r""" + Constructs a PerceptionLM processor which wraps a PerceptionLM image processor and a LLaMa tokenizer into a single processor. + + [`PerceptionLMProcessor`] offers all the functionalities of [`PerceptionLMImageProcessor`] and [`PerceptionLMTokenizerFast`]. See the + [`~PerceptionLMProcessor.__call__`] and [`~PerceptionLMProcessor.decode`] for more information. + + Args: + image_processor ([`PerceptionLMImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`PerceptionLMTokenizerFast`], *optional*): + The tokenizer is a required input. + patch_size (`int`, *optional*): + Patch size from the vision tower. + vision_feature_select_strategy (`str`, *optional*): + The feature selection strategy used to select the vision feature from the vision backbone. + Shoudl be same as in model's config + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + image_token (`str`, *optional*, defaults to `""`): + Special token used to denote image location. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = [ + "chat_template", + "patch_size", + "vision_feature_select_strategy", + "image_token", + "num_additional_image_tokens", + ] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer=None, + patch_size=None, + vision_feature_select_strategy=None, + chat_template=None, + image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + num_additional_image_tokens=0, + **kwargs, + ): + self.patch_size = patch_size + self.num_additional_image_tokens = num_additional_image_tokens + self.vision_feature_select_strategy = vision_feature_select_strategy + self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.image_token_id = ( + tokenizer.image_token_id + if getattr(tokenizer, "image_token_id", None) + else tokenizer.convert_tokens_to_ids(self.image_token) + ) + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + images: ImageInput = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[PerceptionLMProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to PerceptionLMTokenizerFast's [`~PerceptionLMTokenizerFast.__call__`] if `text` is not `None` to encode + the text. 
To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+ PerceptionLMImageProcessor's [`~PerceptionLMImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+ of the above two methods for more information.
+
+ Args:
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+ tensor. Both channels-first and channels-last formats are supported.
+ text (`str`, `List[str]`, `List[List[str]]`):
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
+ If set, will return tensors of a particular framework. Acceptable values are:
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+ Returns:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+ `None`).
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+ """
+ if images is None and text is None:
+ raise ValueError("You have to specify at least one of `images` or `text`.")
+
+ # check if images and text inputs are reversed for BC
+ images, text = _validate_images_text_input_order(images, text)
+
+ output_kwargs = self._merge_kwargs(
+ PerceptionLMProcessorKwargs,
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+ **kwargs,
+ )
+ if images is not None:
+ image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+ else:
+ image_inputs = {}
+
+ if isinstance(text, str):
+ text = [text]
+ elif not isinstance(text, list) and not isinstance(text[0], str):
+ raise ValueError("Invalid input text.
Please provide a string, or a list of strings") + + # try to expand inputs in processing if we have the necessary parts + prompt_strings = text + if image_inputs.get("pixel_values") is not None: + # Replace the image token with the expanded image token sequence + pixel_values = image_inputs["pixel_values"] + height, width = get_image_size(to_numpy_array(pixel_values[0])) + num_image_tokens = (height // self.patch_size) * ( + width // self.patch_size + ) + self.num_additional_image_tokens + if self.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + prompt_strings = [] + for sample in text: + sample = sample.replace(self.image_token, self.image_token * num_image_tokens) + prompt_strings.append(sample) + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PerceptionLMTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PerceptionLMTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["PerceptionLMProcessor"] diff --git a/tests/models/perception_lm/__init__.py b/tests/models/perception_lm/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/perception_lm/test_image_processing_perception_lm.py b/tests/models/perception_lm/test_image_processing_perception_lm.py new file mode 100644 index 000000000000..ec5663439472 --- /dev/null +++ b/tests/models/perception_lm/test_image_processing_perception_lm.py @@ -0,0 +1,236 @@ +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
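To make the placeholder expansion in `PerceptionLMProcessor.__call__` above concrete, here is a small pure-Python sketch of the token-count arithmetic. The numbers (a 336-pixel crop, 14-pixel patches, one extra CLS token) and the `<image>` placeholder string are illustrative assumptions, not values taken from this patch.

```python
# Illustrative values only: a 336x336 crop, 14-pixel patches, one extra CLS token.
height, width = 336, 336
patch_size = 14
num_additional_image_tokens = 1
vision_feature_select_strategy = "default"

num_image_tokens = (height // patch_size) * (width // patch_size) + num_additional_image_tokens
if vision_feature_select_strategy == "default":
    num_image_tokens -= 1  # the CLS feature is cropped, so one placeholder fewer

prompt = "USER: <image>\nWhat is shown here? ASSISTANT:"  # "<image>" is an assumed placeholder string
expanded = prompt.replace("<image>", "<image>" * num_image_tokens)
print(num_image_tokens)           # 576
print(expanded.count("<image>"))  # 576
```

The expanded prompt then carries exactly as many placeholder tokens as the model will produce image features, which is what the shape check in the model's `forward` relies on.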
+ + +import unittest +from typing import Union + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from PIL import Image + + from transformers import PerceptionLMImageProcessor + + if is_torchvision_available(): + from torchvision.transforms import functional as F + + from transformers import PerceptionLMImageProcessorFast + + +class PerceptionLMImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_pad=True, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + do_convert_rgb=True, + ): + super().__init__() + size = size if size is not None else {"shortest_edge": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_pad = do_pad + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_pad": self.do_pad, + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest with CLIP->PerceptionLM +class PerceptionLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = PerceptionLMImageProcessor if is_vision_available() else None + fast_image_processing_class = PerceptionLMImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = PerceptionLMImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + # Ignore copy + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + 
self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + + # Ignore copy + def test_padding(self): + """ + LLaVA needs to pad images to square size before processing as per orig implementation. + Checks that image processor pads images correctly given different background colors. + """ + + # taken from original implementation: https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/perception_lm/mm_utils.py#L152 + def pad_to_square_original( + image: Image.Image, background_color: Union[int, tuple[int, int, int]] = 0 + ) -> Image.Image: + width, height = image.size + if width == height: + return image + elif width > height: + result = Image.new(image.mode, (width, width), background_color) + result.paste(image, (0, (width - height) // 2)) + return result + else: + result = Image.new(image.mode, (height, height), background_color) + result.paste(image, ((height - width) // 2, 0)) + return result + + for i, image_processing_class in enumerate(self.image_processor_list): + image_processor = image_processing_class.from_dict(self.image_processor_dict) + numpify = i == 0 + torchify = i == 1 + image_inputs = self.image_processor_tester.prepare_image_inputs( + equal_resolution=False, numpify=numpify, torchify=torchify + ) + + # test with images in channel-last and channel-first format (only channel-first for torch) + for image in image_inputs: + padded_image = image_processor.pad_to_square(image) + if i == 0: + padded_image_original = pad_to_square_original(Image.fromarray(image)) + padded_image_original = np.array(padded_image_original) + + np.testing.assert_allclose(padded_image, padded_image_original) + + padded_image = image_processor.pad_to_square( + image.transpose(2, 0, 1), input_data_format="channels_first" + ) + padded_image = padded_image.transpose(1, 2, 0) + + np.testing.assert_allclose(padded_image, padded_image_original) + else: + padded_image_original = pad_to_square_original(F.to_pil_image(image)) + padded_image = padded_image.permute(1, 2, 0) + np.testing.assert_allclose(padded_image, padded_image_original) + + # test background color + background_color = (122, 116, 104) + for image in image_inputs: + padded_image = image_processor.pad_to_square(image, background_color=background_color) + if i == 0: + padded_image_original = pad_to_square_original( + Image.fromarray(image), background_color=background_color + ) + else: + padded_image_original = pad_to_square_original( + F.to_pil_image(image), background_color=background_color + ) + padded_image = padded_image.permute(1, 2, 0) + padded_image_original = np.array(padded_image_original) + + np.testing.assert_allclose(padded_image, padded_image_original) + + background_color = 122 + for image in 
image_inputs: + padded_image = image_processor.pad_to_square(image, background_color=background_color) + if i == 0: + padded_image_original = pad_to_square_original( + Image.fromarray(image), background_color=background_color + ) + else: + padded_image_original = pad_to_square_original( + F.to_pil_image(image), background_color=background_color + ) + padded_image = padded_image.permute(1, 2, 0) + padded_image_original = np.array(padded_image_original) + np.testing.assert_allclose(padded_image, padded_image_original) + + # background color length should match channel length + with self.assertRaises(ValueError): + padded_image = image_processor.pad_to_square(image_inputs[0], background_color=(122, 104)) + + with self.assertRaises(ValueError): + padded_image = image_processor.pad_to_square(image_inputs[0], background_color=(122, 104, 0, 0)) + + @unittest.skip(reason="PerceptionLM does not support 4 channel images yet") + # Ignore copy + def test_call_numpy_4_channels(self): + pass diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py new file mode 100644 index 000000000000..ed3e77a84a34 --- /dev/null +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -0,0 +1,625 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
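The padding test above compares against LLaVA's original pad-to-square helper. For reference, a standalone sketch of that behavior (PIL plus NumPy, with a toy 4x2 image; the centering rule matches the reference helper) looks like this:

```python
import numpy as np
from PIL import Image

def pad_to_square(image: Image.Image, background_color=0) -> Image.Image:
    """Pad the shorter side with `background_color` so the result is square, keeping the image centered."""
    width, height = image.size
    if width == height:
        return image
    side = max(width, height)
    result = Image.new(image.mode, (side, side), background_color)
    result.paste(image, ((side - width) // 2, (side - height) // 2))
    return result

img = Image.fromarray(np.arange(24, dtype=np.uint8).reshape(2, 4, 3))  # a 4x2 RGB image
padded = pad_to_square(img, background_color=(122, 116, 104))
print(img.size, "->", padded.size)  # (4, 2) -> (4, 4)
```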
+"""Testing suite for the PyTorch PerceptionLM model.""" + +import unittest + +import requests +from parameterized import parameterized + +from transformers import ( + AutoProcessor, + AutoTokenizer, + PerceptionLMConfig, + PerceptionLMForConditionalGeneration, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import ( + cleanup, + require_bitsandbytes, + require_torch, + require_vision, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + + +if is_vision_available(): + from PIL import Image + + +class PerceptionLMVisionText2TextModelTester: + def __init__( + self, + parent, + ignore_index=-100, + image_token_index=0, + projector_hidden_act="gelu", + seq_length=7, + vision_feature_select_strategy="default", + vision_feature_layer=-1, + text_config={ + "model_type": "llama", + "seq_length": 7, + "is_training": True, + "use_input_mask": True, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 1, + }, + is_training=True, + vision_config={ + "image_size": 8, + "patch_size": 2, + "num_channels": 3, + "is_training": True, + "hidden_size": 32, + "projection_dim": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + ): + self.parent = parent + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.text_config = text_config + self.vision_config = vision_config + self.pad_token_id = text_config["pad_token_id"] + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + self.is_training = is_training + + self.batch_size = 3 + self.num_channels = 3 + self.image_size = 336 + self.num_image_tokens = (self.vision_config["image_size"] // self.vision_config["patch_size"]) ** 2 + self.seq_length = seq_length + self.num_image_tokens + self.encoder_seq_length = self.seq_length + + def get_config(self): + return PerceptionLMConfig( + text_config=self.text_config, + vision_config=self.vision_config, + ignore_index=self.ignore_index, + image_token_index=self.image_token_index, + projector_hidden_act=self.projector_hidden_act, + vision_feature_select_strategy=self.vision_feature_select_strategy, + vision_feature_layer=self.vision_feature_layer, + image_seq_length=self.num_image_tokens, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): 
+ config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 + attention_mask = input_ids.ne(1).to(torch_device) + input_ids[input_ids == config.image_token_index] = self.pad_token_id + input_ids[:, : self.num_image_tokens] = config.image_token_index + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class PerceptionLMForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + """ + Model tester for `PerceptionLMForConditionalGeneration`. + """ + + all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + _is_composite = True + + def setUp(self): + self.model_tester = PerceptionLMVisionText2TextModelTester(self) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=PerceptionLMConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + torch.testing.assert_close(out_embeds, out_ids) + + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images doesn't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successful forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layer): + """ + Test that we can use either one vision feature layer, or a list of + vision feature layers. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layer = vision_feature_layer + + num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. 
Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + + +@require_torch +class PerceptionLMForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("perception_lm-hf/bakPerceptionLM-v1-hf") + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + @slow + @require_bitsandbytes + def test_small_model_integration_test(self): + # Let's make sure we test the preprocessing to replace what is used + model = PerceptionLMForConditionalGeneration.from_pretrained("perception_lm-hf/bakPerceptionLM-v1-hf", load_in_4bit=True) + + prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:" + image_file = "https://perception_lm-vl.github.io/static/images/view.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20) + EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip + + self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_single(self): + # Let's make sure we test the preprocessing to replace what is used + model_id = "facebook/Perception-LM-1B" + + model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B", load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:" + image_file = "https://perception_lm-vl.github.io/static/images/view.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + + output = model.generate(**inputs, max_new_tokens=900, do_sample=False) + EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." 
# fmt: skip + + self.assertEqual( + processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_batched(self): + # Let's make sure we test the preprocessing to replace what is used + model_id = "facebook/Perception-LM-1B" + + model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B", load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT:", + "USER: \nWhat is this? ASSISTANT:", + ] + image1 = Image.open(requests.get("https://perception_lm-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you', 'USER: \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on'] # fmt: skip + + self.assertEqual( + processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_batch(self): + # Let's make sure we test the preprocessing to replace what is used + model = PerceptionLMForConditionalGeneration.from_pretrained("perception_lm-hf/bakPerceptionLM-v1-hf", load_in_4bit=True) + # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!. + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", + "USER: \nWhat is this?\nASSISTANT:", + ] + image1 = Image.open(requests.get("https://perception_lm-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to( + torch_device + ) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = [ + 'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.', + 'USER: \nWhat is this?\nASSISTANT: Cats' + ] # fmt: skip + self.assertEqual( + self.processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_batched_regression(self): + # Let's make sure we test the preprocessing to replace what is used + model_id = "facebook/Perception-LM-1B" + + # Multi-image & multi-prompt (e.g. 
3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before) + model = PerceptionLMForConditionalGeneration.from_pretrained( + "facebook/Perception-LM-1B", load_in_4bit=True, attn_implementation="eager" + ) + processor = AutoProcessor.from_pretrained(model_id, pad_token="") + + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", + "USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT:", + ] + image1 = Image.open(requests.get("https://perception_lm-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True).to( + torch_device + ) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'] # fmt: skip + + self.assertEqual( + processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_torch + @require_vision + def test_batched_generation(self): + model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B", load_in_4bit=True) + + processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B") + + prompt1 = "\n\nUSER: What's the difference of two images?\nASSISTANT:" + prompt2 = "\nUSER: Describe the image.\nASSISTANT:" + prompt3 = "\nUSER: Describe the image.\nASSISTANT:" + url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" + url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" + image1 = Image.open(requests.get(url1, stream=True).raw) + image2 = Image.open(requests.get(url2, stream=True).raw) + + inputs = processor( + images=[image1, image2, image1, image2], + text=[prompt1, prompt2, prompt3], + return_tensors="pt", + padding=True, + ).to(torch_device) + + model = model.eval() + + EXPECTED_OUTPUT = [ + "\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while", + "\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small", + "\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. 
The llama is the", + ] + + generate_ids = model.generate(**inputs, max_new_tokens=20) + outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + self.assertEqual(outputs, EXPECTED_OUTPUT) + + def test_tokenizer_integration(self): + slow_tokenizer = AutoTokenizer.from_pretrained("liuhaotian/perception_lm-v1.6-34b", use_fast=False) + slow_tokenizer.add_tokens("", True) + + fast_tokenizer = AutoTokenizer.from_pretrained( + "liuhaotian/perception_lm-v1.6-34b", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + from_slow=True, + legacy=False, + ) + fast_tokenizer.add_tokens("", True) + + prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" + EXPECTED_OUTPUT = ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] # fmt: skip + self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + + @slow + @require_bitsandbytes + def test_generation_no_images(self): + model_id = "facebook/Perception-LM-1B" + model = PerceptionLMForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + # Prepare inputs with no images + inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device) + + # Make sure that `generate` works + _ = model.generate(**inputs, max_new_tokens=20) + + @slow + @require_bitsandbytes + def test_generation_siglip_backbone(self): + model_id = "perception_lm-hf/perception_lm-interleave-qwen-0.5b-hf" + model = PerceptionLMForConditionalGeneration.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device) + processor = AutoProcessor.from_pretrained(model_id) + + # check processing with expansion of inputs (w/o expansion should work with any backbone) + processor.vision_feature_select_strategy = "default" + processor.patch_size = 14 + + image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor( + text="<|im_start|>user\n\nWhat are these?<|im_end|>\n<|im_start|>assistant", + images=raw_image, + return_tensors="pt", + ).to(torch_device, torch.float16) + + # Make sure that `generate` works + output = model.generate(**inputs, max_new_tokens=30) + + EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. 
The cat" + self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT) + + @slow + def test_pixtral(self): + model_id = "mistral-community/pixtral-12b" + model = PerceptionLMForConditionalGeneration.from_pretrained(model_id) + processor = AutoProcessor.from_pretrained(model_id) + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), + ] + PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" + + # image = Image.open(requests.get(url, stream=True).raw) + inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device) + generate_ids = model.generate(**inputs, max_new_tokens=500) + ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + print(ouptut) + + # fmt: off + EXPECTED_GENERATION = """ +Describe the images. +Certainly! Here are the descriptions of the images: + +1. **Image 1**: This image features a black dog with a glossy coat sitting on a wooden surface. The dog has a calm and attentive expression, looking directly at the camera. The wooden background has a rustic appearance with visible grain and texture. + +2. **Image 2**: This image captures a breathtaking view of a mountainous landscape. The mountains are rugged and covered with patches of green vegetation. The sky above is clear, and the scene conveys a sense of tranquility and natural beauty. + +3. **Image 3**: This image shows a beach scene during sunset. The waves are gently rolling onto the shore, and several people can be seen in the water, possibly surfing or swimming. The sky is painted with warm hues of orange and yellow, creating a serene and picturesque atmosphere. + +4. **Image 4**: This image depicts a narrow, winding path that cuts through a lush, green landscape. On either side of the path, there is dense grass and various trees, including a prominent tree with white blossoms. The sky is clear and blue, adding to the peaceful and inviting ambiance of the scene. + +These descriptions provide a detailed overview of the content and atmosphere of each image. +""" + # fmt: on + # check that both inputs are handled correctly and generate the same output + self.assertEqual(ouptut, EXPECTED_GENERATION) + + @slow + @require_bitsandbytes + def test_pixtral_4bit(self): + model_id = "mistral-community/pixtral-12b" + model = PerceptionLMForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), + ] + PROMPT = "[INST][IMG][IMG]Describe the images.[/INST]" + + inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(torch_device, torch.float16) + generate_ids = model.generate(**inputs, max_new_tokens=50) + output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + EXPECTED_GENERATION = [ + # CUDA output + "Describe the images. The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. 
The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador", + # XPU output + "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which covers the entire background. The dog appears to be the main focus", + ] # fmt: skip + self.assertTrue(output in EXPECTED_GENERATION) + + @slow + @require_bitsandbytes + def test_pixtral_batched(self): + model_id = "mistral-community/pixtral-12b" + model = PerceptionLMForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/17/150/500", stream=True).raw), + ] + PROMPT = [ + "[INST][IMG]What breed is the dog?[/INST]", + "[INST][IMG]What is shown in this image?[/INST]", + ] + + inputs = processor(text=PROMPT, images=IMG_URLS, padding=True, return_tensors="pt").to( + torch_device, torch.float16 + ) + generate_ids = model.generate(**inputs, max_new_tokens=50) + output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + + EXPECTED_GENERATION = [ + 'What breed is the dog?The dog in the image is a black Labrador Retriever.', + 'What is shown in this image?The image depicts a narrow, winding dirt path surrounded by lush greenery. The path is flanked by grass and shrubs on both sides. On the left side, there are tall trees and dense foliage, while on the right side, there' + ] # fmt: skip + self.assertEqual(output, EXPECTED_GENERATION) diff --git a/tests/models/perception_lm/test_processor_perception_lm.py b/tests/models/perception_lm/test_processor_perception_lm.py new file mode 100644 index 000000000000..c9c3b47f6593 --- /dev/null +++ b/tests/models/perception_lm/test_processor_perception_lm.py @@ -0,0 +1,108 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
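The integration tests above all feed mixed image-and-text batches; under the hood, `forward` (earlier in this patch) splices the projected image features into the text embeddings at the placeholder positions with `masked_scatter`. A torch-only sketch of that mechanism, using toy shapes and an arbitrary placeholder id:

```python
import torch

image_token_id = 32000  # arbitrary placeholder id for this sketch
hidden_size = 4

# One sequence with two image-placeholder positions (toy values).
input_ids = torch.tensor([[1, image_token_id, image_token_id, 7]])
inputs_embeds = torch.zeros(1, 4, hidden_size)   # text embeddings (zeros for visibility)
image_features = torch.ones(1, 2, hidden_size)   # (num_images, tokens_per_image, hidden)

special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
assert inputs_embeds[special_image_mask].numel() == image_features.numel()

inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
print(inputs_embeds[0, :, 0])  # tensor([0., 1., 1., 0.]) -- image features landed on the placeholders
```

If the placeholder count and the number of image features disagree, the `numel()` comparison fails, which is exactly the mismatch the `test_mismatching_num_image_tokens` test provokes.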
+import json
+import shutil
+import tempfile
+import unittest
+
+from transformers import AutoProcessor, AutoTokenizer, PerceptionLMTokenizerFast, PerceptionLMProcessor
+from transformers.testing_utils import require_vision
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_vision_available():
+    from transformers import CLIPImageProcessor
+
+if is_torch_available():
+    pass
+
+
+@require_vision
+class PerceptionLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = PerceptionLMProcessor
+
+    @classmethod
+    def setUpClass(cls):
+        cls.tmpdirname = tempfile.mkdtemp()
+
+        image_processor = CLIPImageProcessor(do_center_crop=False)
+        tokenizer = PerceptionLMTokenizerFast.from_pretrained("huggyllama/llama-7b")
+        tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
+        processor_kwargs = cls.prepare_processor_dict()
+        processor = PerceptionLMProcessor(image_processor, tokenizer, **processor_kwargs)
+        processor.save_pretrained(cls.tmpdirname)
+        cls.image_token = processor.image_token
+
+    def get_tokenizer(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+    def get_image_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.tmpdirname, ignore_errors=True)
+
+    @staticmethod
+    def prepare_processor_dict():
+        return {
+            "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
+            "patch_size": 128,
+            "vision_feature_select_strategy": "default"
+        }  # fmt: skip
+
+    def test_chat_template_is_saved(self):
+        processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
+        processor_dict_loaded = json.loads(processor_loaded.to_json_string())
+        # chat templates aren't serialized to json in processors
+        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+
+        # they have to be saved as separate file and loaded back from that file
+        # so we check if the same template is loaded
+        processor_dict = self.prepare_processor_dict()
+        self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None))
+
+    def test_can_load_various_tokenizers(self):
+        for checkpoint in ["Intel/perception_lm-gemma-2b", "facebook/Perception-LM-1B"]:
+            processor = PerceptionLMProcessor.from_pretrained(checkpoint)
+            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+            self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)
+
+    def test_special_mm_token_truncation(self):
+        """Tests that special vision tokens do not get truncated when `truncation=True` is set."""
+
+        processor = PerceptionLMProcessor.from_pretrained("facebook/Perception-LM-1B")
+
+        input_str = self.prepare_text_inputs(batch_size=2, modality="image")
+        image_input = self.prepare_image_inputs(batch_size=2)
+
+        _ = 
processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=None, + padding=True, + ) + + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=True, + padding=True, + max_length=5, + ) From 259a8ec27738dc311ca44cd6ef9d693995b956fe Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Sat, 26 Apr 2025 06:13:25 +0000 Subject: [PATCH 02/65] A working plm with fixed image features --- .../configuration_perception_lm.py | 4 +- .../convert_perception_lm_weights_to_hf.py | 621 ++++++++++++++---- .../perception_lm/modeling_perception_lm.py | 191 ++++-- 3 files changed, 639 insertions(+), 177 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 79b3cce8549f..38ac3462f57f 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -84,6 +84,7 @@ def __init__( self, vision_config=None, text_config=None, + projector_pooling_ratio=1, image_token_index=32000, projector_hidden_act="gelu", vision_feature_select_strategy="default", @@ -131,8 +132,7 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config - self.multimodal_projector_bias = multimodal_projector_bias - + self.projector_pooling_ratio = projector_pooling_ratio super().__init__(**kwargs) diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index cde3af02d8b8..57634be620bd 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -1,4 +1,4 @@ -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,192 +12,549 @@ # See the License for the specific language governing permissions and # limitations under the License. 
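The `projector_pooling_ratio` option added to the configuration above controls how aggressively the projector downsamples vision tokens before they are spliced into the language model. A rough sketch of the effect, mirroring the `AdaptiveAvgPooling` module that appears later in this patch (tensor sizes here are made up for illustration):

```py
# Sketch of what projector_pooling_ratio does: average-pool a square grid of vision
# tokens so that each spatial dimension shrinks by the pooling ratio.
import math

import torch
import torch.nn.functional as F


def pool_vision_tokens(x: torch.Tensor, pooling_ratio: int) -> torch.Tensor:
    # x: (batch, num_tokens, channels), where num_tokens forms a square grid
    batch, num_tokens, channels = x.shape
    side = int(math.sqrt(num_tokens))
    assert side * side == num_tokens, "expects a square token grid"
    x = x.permute(0, 2, 1).reshape(batch, channels, side, side)
    x = F.adaptive_avg_pool2d(x, (side // pooling_ratio, side // pooling_ratio))
    return x.flatten(2).transpose(1, 2)


tokens = torch.randn(2, 1024, 1024)          # a 32x32 grid of 1024-dim features (example numbers)
print(pool_vision_tokens(tokens, 2).shape)   # torch.Size([2, 256, 1024])
```

With a ratio of 2, a 32x32 grid of patch features collapses to 16x16, cutting the number of image tokens the language model attends over by a factor of four.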
import argparse -import glob +import gc +import json +import os +import tempfile +import warnings +from typing import List import torch -from huggingface_hub import file_exists, hf_hub_download, snapshot_download -from safetensors import safe_open +from tokenizers import AddedToken, processors from transformers import ( - AddedToken, - AutoConfig, - AutoImageProcessor, - AutoTokenizer, + CLIPVisionConfig, + GenerationConfig, + LlamaConfig, + LlamaForCausalLM, + LlamaTokenizer, + PreTrainedTokenizerFast, + SiglipVisionConfig, +) +from transformers.convert_slow_tokenizer import TikTokenConverter +from transformers.models.perception_lm.configuration_perception_lm import ( PerceptionLMConfig, +) +from transformers.models.perception_lm.modeling_perception_lm import ( PerceptionLMForConditionalGeneration, - PerceptionLMProcessor, - SiglipVisionConfig, ) -EPILOG_TXT = """Example: - python transformers/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py --text_model_id lmsys/vicuna-7b-v1.5 --vision_model_id openai/clip-vit-large-patch14-336 --output_hub_path org/perception_lm-v1.5-7b-conv --old_state_dict_id liuhaotian/perception_lm-v1.5-7b - -Example for creating the old state dict file with Python: - import torch - from perception_lm.model.language_model.perception_lm_llama import PerceptionLMLlamaForCausalLM - - # load model - kwargs = {"device_map": "auto", "torch_dtype": torch.float16} - model = PerceptionLMLlamaForCausalLM.from_pretrained("liuhaotian/perception_lm-v1.5-7b", low_cpu_mem_usage=True, **kwargs) +try: + from transformers import LlamaTokenizerFast +except ImportError as e: + warnings.warn(e) + warnings.warn( + "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" + ) + LlamaTokenizerFast = None - # load vision tower - model.get_vision_tower().load_model() +""" +Sample usage: + +``` +python src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py \ + --input_dir /path/to/downloaded/perception_lm/model_path --output_dir /output/path +``` + +Thereafter, models can be loaded via: + +```py +from transformers import LlamaForCausalLM, LlamaTokenizer + +model = LlamaForCausalLM.from_pretrained("/output/path") +tokenizer = LlamaTokenizer.from_pretrained("/output/path") +``` + +Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions +come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
+ +If you want your tokenizer to add a bos automatically you should update the tokenizer._tokenizers.post_processor: + +```py +from tokenizers import processors +bos = "<|begin_of_text|>" +tokenizer._tokenizers.post_processor = processors.Sequence( + [ + processors.ByteLevel(trim_offsets=False), + processors.TemplateProcessing( + single=f"{bos}:0 $A:0", + pair=f"{bos}:0 $A:0 {bos}:1 $B:1", + special_tokens=[ + (bos, tokenizer.encode(bos)), + ], + ), + ] +) +``` - # Save state dict - torch.save(model.state_dict(), "tmp/hf_models/perception_lm-v1.5-7b/model_state_dict.bin") """ KEYS_TO_MODIFY_MAPPING = { "model.vision_tower.": "", - ".vision_resampler": "", # all lmms-lab models do avg pooling, so no vision_resampler "model.mm_projector": "multi_modal_projector", "model": "model.model", "vision_model.model": "vision_model", "lm_head": "language_model.lm_head", "model.model": "language_model.model", - "multi_modal_projector.0": "multi_modal_projector.linear_1", - "multi_modal_projector.2": "multi_modal_projector.linear_2", + "language_model.model.image_newline": "image_newline", } +BOS_ADDED_TOKEN = AddedToken( + "<|begin_of_text|>", + single_word=False, + lstrip=False, + rstrip=False, + normalized=False, + special=True, +) +EOS_ADDED_TOKEN = AddedToken( + "<|end_of_text|>", + single_word=False, + lstrip=False, + rstrip=False, + normalized=False, + special=True, +) +EOT_ADDED_TOKEN = AddedToken( + "<|eot_id|>", + single_word=False, + lstrip=False, + rstrip=False, + normalized=False, + special=True, +) -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - # tied wieghts so lm.head is not saved. 
Let's clone to load state dict - if "lm_head.weight" not in original_state_dict: - original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone() - - if "model.image_newline" in original_state_dict: - # not used in the original implementation because "merge_type=flat" - del original_state_dict["model.image_newline"] - return original_state_dict +DEFAULT_SPECIAL_TOKENS = { + "perception_lm": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|image|>", + "<|reserved_special_token_1|>", + "<|reserved_special_token_2|>", + "<|reserved_special_token_3|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|reserved_special_token_4|>", + "<|eot_id|>", # End of turn + ] + + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)] +} -# used only for perception_lm-interlave -# for ex: Qwen/Qwen1.5-0.5B-Chat google/siglip-so400m-patch14-384 lmms-lab/perception_lm-next-interleave-qwen-0.5b def convert_state_dict_to_hf(state_dict): new_state_dict = {} for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in key: key = key.replace(key_to_modify, new_key) - new_state_dict[key] = value + new_state_dict[key] = value.to(torch.float16) return new_state_dict -def convert_perception_lm_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): - torch.set_default_dtype(torch.float16) - text_config = AutoConfig.from_pretrained(text_model_id) - - tokenizer = AutoTokenizer.from_pretrained(text_model_id) - tokenizer.add_tokens(AddedToken("", special=True, normalized=False), special_tokens=True) - if "Qwen" not in text_model_id: # qwen already has a pad token - tokenizer.add_special_tokens({"pad_token": ""}) +def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): + return multiple_of * ( + (int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of + ) - image_processor = AutoImageProcessor.from_pretrained(vision_model_id) - processor = PerceptionLMProcessor(tokenizer=tokenizer, image_processor=image_processor) - if "siglip" in vision_model_id: - vision_config = SiglipVisionConfig( - hidden_size=1152, - image_size=384, - intermediate_size=4304, +def read_json(path): + with open(path, "r") as f: + return json.load(f) + + +def write_json(text, path): + with open(path, "w") as f: + json.dump(text, f) + + +def write_model( + model_path, + input_base_path, + params, + image_token_id, + safe_serialization=True, + vocab_size=None, + num_shards=None, + push_to_hub=False, +): + print("Converting the model.") + num_shards = 1 + model_params = params.get("model", params) + n_layers = model_params["n_layers"] + n_heads = model_params["n_heads"] + n_heads_per_shard = n_heads // num_shards + dim = model_params["dim"] + dims_per_head = dim // n_heads + base = model_params.get("rope_theta", 10000.0) + inv_freq = 1.0 / ( + base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head) + ) + context_length = model_params["max_seqlen"] + max_position_embeddings = context_length + tie_word_embeddings = model_params.get("weight_tying", False) + projector_pooling_ratio = model_params.get("pooling_ratio", 1) + + if model_params.get("n_kv_heads", None) is not None: + num_key_value_heads = model_params["n_kv_heads"] # for GQA / MQA + num_key_value_heads_per_shard = num_key_value_heads // num_shards + key_value_dim = dims_per_head * num_key_value_heads + else: # compatibility with other checkpoints + num_key_value_heads = n_heads + 
num_key_value_heads_per_shard = n_heads_per_shard + key_value_dim = dim + + # permute for sliced rotary + def permute(w, n_heads, dim1=dim, dim2=dim): + return ( + w.view(n_heads, dim1 // n_heads // 2, 2, dim2) + .transpose(1, 2) + .reshape(dim1, dim2) + ) + + with tempfile.TemporaryDirectory() as tmp_model_path: + print(f"Fetching all parameters from the checkpoint at {input_base_path}.") + # Load weights + if num_shards == 1: + # Not sharded + # (The sharded implementation would also work, but this is simpler.) + loaded = torch.load( + os.path.join(input_base_path, "consolidated.pth"), + map_location="cpu", + weights_only=True, + ) + else: + # Sharded + checkpoint_list = sorted( + [file for file in os.listdir(input_base_path) if file.endswith(".pth")] + ) + print("Loading in order:", checkpoint_list) + loaded = [ + torch.load( + os.path.join(input_base_path, file), + map_location="cpu", + weights_only=True, + ) + for file in checkpoint_list + ] + param_count = 0 + index_dict = {"weight_map": {}} + for layer_i in range(n_layers): + filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" + assert num_shards == 1, "PerceptionLM does not support sharded weights" + state_dict = { + f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( + loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads + ), + f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( + loaded[f"layers.{layer_i}.attention.wk.weight"], + n_heads=num_key_value_heads, + dim1=key_value_dim, + ), + f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[ + f"layers.{layer_i}.attention.wv.weight" + ], + f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[ + f"layers.{layer_i}.attention.wo.weight" + ], + f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[ + f"layers.{layer_i}.feed_forward.w1.weight" + ], + f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[ + f"layers.{layer_i}.feed_forward.w2.weight" + ], + f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[ + f"layers.{layer_i}.feed_forward.w3.weight" + ], + f"model.layers.{layer_i}.input_layernorm.weight": loaded[ + f"layers.{layer_i}.attention_norm.weight" + ], + f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ + f"layers.{layer_i}.ffn_norm.weight" + ], + } + state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = ( + inv_freq + ) + state_dict = convert_state_dict_to_hf(state_dict) + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(tmp_model_path, filename)) + print(f"Saved {filename}") + + filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" + + state_dict = { + "model.embed_tokens.weight": loaded["tok_embeddings.weight"], + "model.norm.weight": loaded["norm.weight"], + "model.lm_head.weight": ( + loaded["output.weight"] + if not tie_word_embeddings + else loaded["tok_embeddings.weight"] + ), + "multi_modal_projector.projector.0.weight": loaded[ + "vision_projector.projector.0.weight" + ], + "multi_modal_projector.projector.2.weight": loaded[ + "vision_projector.projector.2.weight" + ], + "multi_modal_projector.projector.0.bias": loaded[ + "vision_projector.projector.0.bias" + ], + "multi_modal_projector.projector.2.bias": loaded[ + "vision_projector.projector.2.bias" + ], + } + state_dict = convert_state_dict_to_hf(state_dict) + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(tmp_model_path, filename)) + 
print(f"Saved {filename}") + # Write configs + index_dict["metadata"] = {"total_size": param_count * 2} + write_json( + index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json") + ) + ffn_dim_multiplier = ( + model_params["ffn_dim_multiplier"] + if "ffn_dim_multiplier" in model_params + else 1 + ) + multiple_of = ( + model_params["multiple_of"] if "multiple_of" in model_params else 256 + ) + + bos_token_id = 128000 + eos_token_id = [128001, 128008, 128009] + + use_scaled_rope = model_params["use_scaled_rope"] + if use_scaled_rope: + rope_scaling = { + "factor": model_params["rope_scale_factor"] * 1.0, + "low_freq_factor": model_params.get("low_freq_factor", 1.0) * 1.0, + "high_freq_factor": model_params.get("high_freq_factor", 4.0) * 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3", + } + else: + rope_scaling = None + + text_config = LlamaConfig( + hidden_size=dim, + intermediate_size=compute_intermediate_size( + dim, ffn_dim_multiplier, multiple_of + ), + num_attention_heads=model_params["n_heads"], + num_hidden_layers=model_params["n_layers"], + rms_norm_eps=model_params["norm_eps"], + num_key_value_heads=num_key_value_heads, + vocab_size=vocab_size, + rope_theta=base, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + ) + + vision_config = CLIPVisionConfig( + hidden_size=1024, + image_size=336, + intermediate_size=4096, num_attention_heads=16, - num_hidden_layers=26, + num_hidden_layers=24, patch_size=14, - vision_use_head=False, - ).to_dict() - else: - vision_config = None - - config = PerceptionLMConfig( - text_config=text_config, - vision_config=vision_config, + projection_dim=768, + vocab_size=32000, + ) + + config = PerceptionLMConfig( + text_config=text_config.to_dict(), + vision_config=vision_config.to_dict(), + projector_pooling_ratio=projector_pooling_ratio, + image_token_index=image_token_id, + ) + + config.save_pretrained(tmp_model_path) + + generation_config = GenerationConfig( + do_sample=False, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + ) + generation_config.save_pretrained(tmp_model_path) + + # Make space so we can load the model properly now. + del state_dict + del loaded + gc.collect() + + print("Loading the checkpoint in a PerceptionLM model.") + model = PerceptionLMForConditionalGeneration.from_pretrained( + tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True + ) + + # Avoid saving this as part of the config. 
+ del model.config._name_or_path + model.config.torch_dtype = torch.float16 + + print("Saving in the Transformers format.") + if push_to_hub: + print("Pushing to the hub.") + model.push_to_hub( + model_path, + safe_serialization=safe_serialization, + private=True, + use_temp_dir=True, + ) + else: + print("Saving to disk.") + model.save_pretrained(model_path, safe_serialization=safe_serialization) + + +class Llama3Converter(TikTokenConverter): + def __init__( + self, + vocab_file, + special_tokens=None, + context_length=11520, + **kwargs, + ): + super().__init__(vocab_file, additional_special_tokens=special_tokens, **kwargs) + tokenizer = self.converted() + model_id = "meta-llama/Llama-3.2-1B-Instruct" + revision = "e9f8effbab1cbdc515c11ee6e098e3d5a9f51e14" + from transformers import AutoTokenizer + + t = AutoTokenizer.from_pretrained(model_id, revision=revision) + additional_kwargs = {"chat_template": t.chat_template} + + self.converted_tokenizer = PreTrainedTokenizerFast( + tokenizer_object=tokenizer, + bos_token="<|begin_of_text|>", + eos_token="<|eot_id|>", + model_input_names=["input_ids", "attention_mask"], + model_max_length=context_length, + clean_up_tokenization_spaces=True, + **additional_kwargs, + ) + self.update_post_processor(self.converted_tokenizer) + # finer special_tokens_map.json + self.converted_tokenizer._bos_token = BOS_ADDED_TOKEN + self.converted_tokenizer._eos_token = EOT_ADDED_TOKEN + + # We can't do this while building the tokenizer because we have no easy access to the bos token id + def update_post_processor(self, tokenizer): + tokenizer._tokenizer.post_processor = processors.Sequence( + [ + processors.ByteLevel(trim_offsets=False), + processors.TemplateProcessing( + single="<|begin_of_text|> $A", + pair="<|begin_of_text|>:0 $A:0 <|begin_of_text|>:1 $B:1", + special_tokens=[ + ( + "<|begin_of_text|>", + tokenizer.convert_tokens_to_ids("<|begin_of_text|>"), + ), + ], + ), + ] + ) + + +def write_tokenizer( + tokenizer_path, + input_tokenizer_path, + special_tokens=None, + context_length=11520, + push_to_hub=False, +): + print("Converting the tokenizer.") + tokenizer_class = ( + LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast ) - - # llms-lab interleeave models do not use any selection startegy except for last hidden state - if "Qwen" in text_model_id: - config.image_token_id = 151646 - if "siglip" in vision_model_id: - config.vision_feature_select_strategy = "full" - config.vision_feature_layer = -1 + tokenizer = Llama3Converter( + input_tokenizer_path, + special_tokens, + context_length, + ).converted_tokenizer + + if push_to_hub: + print( + f"Pushing a {tokenizer_class.__name__} to the Hub repo - {tokenizer_path}." 
+ ) + tokenizer.push_to_hub(tokenizer_path, private=True, use_temp_dir=True) else: - config.pad_token_id = 32001 - config.image_token_id = 32000 - - with torch.device("meta"): - model = PerceptionLMForConditionalGeneration(config) - - # Some perception_lm variants like microsoft/perception_lm-med-v1.5-mistral-7b use safetensors to store weights - if file_exists(old_state_dict_id, "model_state_dict.bin"): - state_dict_path = hf_hub_download(old_state_dict_id, "model_state_dict.bin") - state_dict = torch.load(state_dict_path, map_location="cpu", weights_only=True) - else: - state_dict = load_original_state_dict(old_state_dict_id) - - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, strict=True, assign=True) - - pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data - mu = torch.mean(pre_expansion_embeddings, dim=0).float() - n = pre_expansion_embeddings.size()[0] - sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n - dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) - - # We add an image token so we resize the model and pad to 64 for performance reasons - pad_shape = 64 - vocab_size = config.text_config.vocab_size - model.resize_token_embeddings(config.text_config.vocab_size + 2, pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple( - (dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])) - ), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), - dim=0, - ) - - model.push_to_hub(output_hub_path) - processor.push_to_hub(output_hub_path) + print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") + tokenizer.save_pretrained(tokenizer_path) + return tokenizer def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_dir", + help="Location of Llama weights, which contains tokenizer.model and model folders", ) parser.add_argument( - "--text_model_id", - help="Hub location of the text model", + "--output_dir", + help="Location to write HF model and tokenizer", ) parser.add_argument( - "--vision_model_id", - help="Hub location of the vision model", + "--push_to_hub", + help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", + action="store_true", + default=False, ) parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", + "--safe_serialization", + action="store_true", + default=True, + help="Whether or not to save using `safetensors`.", ) parser.add_argument( - "--old_state_dict_id", - help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", + "--num_shards", + default=None, + type=int, + help="The number of individual shards used for the model. 
Does not have to be the same as the number of consolidated_xx.pth", + ) + parser.add_argument( + "--special_tokens", + default=None, + type=List[str], + help="The list of special tokens that should be added to the model.", ) args = parser.parse_args() - convert_perception_lm_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) + if args.special_tokens is None: + # no special tokens by default + args.special_tokens = DEFAULT_SPECIAL_TOKENS.get("perception_lm", []) + + params = read_json(os.path.join(args.input_dir, "params.json")) + + spm_path = os.path.join(args.input_dir, "tokenizer.model") + tokenizer = write_tokenizer( + args.output_dir, + spm_path, + special_tokens=args.special_tokens, + context_length=params["model"]["max_seqlen"], + push_to_hub=args.push_to_hub, + ) + vocab_size = len(tokenizer) + image_token_id = tokenizer.encode("<|image|>", add_special_tokens=False)[0] + write_model( + model_path=args.output_dir, + input_base_path=args.input_dir, + params=params, + image_token_id=image_token_id, + safe_serialization=args.safe_serialization, + vocab_size=vocab_size, + num_shards=args.num_shards, + push_to_hub=args.push_to_hub, + ) if __name__ == "__main__": diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 563ac0b33eb4..2b432553b06a 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -17,9 +17,12 @@ from dataclasses import dataclass from typing import List, Optional, Tuple, Union +import math + import torch import torch.utils.checkpoint from torch import nn +import torch.nn.functional as F from ...activations import ACT2FN from ...generation import GenerationMixin @@ -85,27 +88,77 @@ class PerceptionLMCausalLMOutputWithPast(ModelOutput): image_hidden_states: Optional[torch.FloatTensor] = None -# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->PerceptionLM +# # Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->PerceptionLM +# class PerceptionLMMultiModalProjector(nn.Module): +# def __init__(self, config: PerceptionLMConfig): +# super().__init__() +# # We have hidden_size * the number of vision feature layers +# num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) +# self.linear_1 = nn.Linear( +# config.vision_config.hidden_size * num_feature_layers, +# config.text_config.hidden_size, +# bias=config.multimodal_projector_bias, +# ) +# self.act = ACT2FN[config.projector_hidden_act] +# self.linear_2 = nn.Linear( +# config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias +# ) + +# def forward(self, image_features): +# hidden_states = self.linear_1(image_features) +# hidden_states = self.act(hidden_states) +# hidden_states = self.linear_2(hidden_states) +# return hidden_states + + +class AdaptiveAvgPooling(nn.Module): + def __init__(self, pooling_ratio=2): + super(AdaptiveAvgPooling, self).__init__() + self.pooling_ratio = pooling_ratio + + def forward(self, x): + b, num_tokens, c = x.shape + h = int(math.sqrt(num_tokens)) + assert h * h == num_tokens + + shape = (h // self.pooling_ratio, h // self.pooling_ratio) + x = x.permute(0, 2, 1).reshape(b, -1, h, h) + x = F.adaptive_avg_pool2d(x, shape) + x = x.flatten(2).transpose(1, 2) + + return x + + class 
PerceptionLMMultiModalProjector(nn.Module): def __init__(self, config: PerceptionLMConfig): super().__init__() - # We have hidden_size * the number of vision feature layers - num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) - self.linear_1 = nn.Linear( - config.vision_config.hidden_size * num_feature_layers, - config.text_config.hidden_size, - bias=config.multimodal_projector_bias, + input_size = config.vision_config.hidden_size + output_size = config.text_config.hidden_size + self.projector = nn.Sequential( + nn.Linear( + in_features=input_size, + out_features=output_size, + bias=True, + ), + nn.GELU(), + nn.Linear( + in_features=output_size, + out_features=output_size, + bias=True, + ), ) - self.act = ACT2FN[config.projector_hidden_act] - self.linear_2 = nn.Linear( - config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + self.pooling = ( + AdaptiveAvgPooling(config.projector_pooling_ratio) + if config.projector_pooling_ratio > 1 + else nn.Identity() ) - def forward(self, image_features): - hidden_states = self.linear_1(image_features) - hidden_states = self.act(hidden_states) - hidden_states = self.linear_2(hidden_states) - return hidden_states + def forward(self, x): + x = x.permute(1, 0, 2) # NLD -> LND + x = self.projector(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.pooling(x) + return x PERCEPTION_LM_START_DOCSTRING = r""" @@ -146,7 +199,11 @@ def _init_weights(self, module): # important: this ported version of PerceptionLM isn't meant for training from scratch - only # inference and fine-tuning - so the proper init weights code has been removed - the original codebase # https://github.com/haotian-liu/LLaVA/tree/main/perception_lm should serve for that purpose - std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range) + std = getattr( + self.config, + "initializer_range", + self.config.get_text_config().initializer_range, + ) if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=std) @@ -236,7 +293,9 @@ def _init_weights(self, module): PERCEPTION_LM_START_DOCSTRING, ) # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration with LLAVA->PERCEPTION_LM,Llava->PerceptionLM,llava-hf/llava-1.5-7b-hf->facebook/Perception-LM-1B -class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): +class PerceptionLMForConditionalGeneration( + PerceptionLMPreTrainedModel, GenerationMixin +): def __init__(self, config: PerceptionLMConfig): super().__init__(config) self.vision_tower = AutoModel.from_config(config.vision_config) @@ -246,9 +305,13 @@ def __init__(self, config: PerceptionLMConfig): self.language_model = AutoModelForCausalLM.from_config(config.text_config) if self.language_model._tied_weights_keys is not None: - self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self._tied_weights_keys = [ + f"language_model.{k}" for k in self.language_model._tied_weights_keys + ] - self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self.pad_token_id = ( + self.config.pad_token_id if self.config.pad_token_id is not None else -1 + ) self.post_init() @@ -294,11 +357,15 @@ def get_image_features( image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). 
""" if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + raise ValueError( + f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}" + ) kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, **kwargs) + image_outputs = self.vision_tower( + pixel_values, output_hidden_states=True, **kwargs + ) # If we have one vision feature layer, return the corresponding hidden states, # otherwise, select the hidden states of each feature layer and concatenate them @@ -307,17 +374,24 @@ def get_image_features( if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] else: - hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + hs_pool = [ + image_outputs.hidden_states[layer_idx] + for layer_idx in vision_feature_layer + ] # For default; crop CLS from each hidden state in the hidden state pool if vision_feature_select_strategy == "default": hs_pool = [hs[:, 1:] for hs in hs_pool] selected_image_feature = torch.cat(hs_pool, dim=-1) - - image_features = self.multi_modal_projector(selected_image_feature) + image_features = torch.load( + "/checkpoint/vision_encoder/smhu/debug/0/h_img_dump_0.pt" + ).to(selected_image_feature) + image_features = self.multi_modal_projector(image_features) return image_features @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings( + output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -376,13 +450,23 @@ def forward( "USER: \nWhat's the content of the image? 
ASSISTANT: The image features a busy city street with a stop sign prominently displayed" ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_feature_layer = ( - vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + vision_feature_layer + if vision_feature_layer is not None + else self.config.vision_feature_layer ) vision_feature_select_strategy = ( vision_feature_select_strategy @@ -391,7 +475,9 @@ def forward( ) if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + raise ValueError( + "You must specify exactly one of input_ids or inputs_embeds" + ) if pixel_values is not None and inputs_embeds is not None: raise ValueError( @@ -410,15 +496,27 @@ def forward( ) special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) - if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel(): - n_image_tokens = (input_ids == self.config.image_token_id).sum() - n_image_features = image_features.shape[0] * image_features.shape[1] - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to( + inputs_embeds.device + ) + # if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel(): + # n_image_tokens = (input_ids == self.config.image_token_id).sum() + # n_image_features = image_features.shape[0] * image_features.shape[1] + # raise ValueError( + # f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + # ) + image_features = image_features.to( + inputs_embeds.device, inputs_embeds.dtype + ) + inputs_embeds = inputs_embeds.masked_scatter( + special_image_mask, image_features + ) + # print(inputs_embeds.shape) + # occhi_embeds = torch.load( + # "/checkpoint/vision_encoder/smhu/debug/0/h_post_stitching_dump_0.pt" + # ) + # print(occhi_embeds.shape) + # inputs_embeds = occhi_embeds outputs = self.language_model( attention_mask=attention_mask, @@ -442,16 +540,23 @@ def forward( if attention_mask is not None: # we use the input attention mask to shift the logits and labels, because it is 2D. 
# we also crop attn mask in case it is longer, which happens in PrefixTuning with peft - shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) - shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() - shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to( + logits.device + ) + shift_logits = logits[..., :-1, :][ + shift_attention_mask.to(logits.device) != 0 + ].contiguous() + shift_labels = labels[..., 1:][ + shift_attention_mask.to(labels.device) != 0 + ].contiguous() else: shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = nn.CrossEntropyLoss() loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device) + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1).to(shift_logits.device), ) if not return_dict: From c18171e3b41494360cf96288e95c22641c2c3234 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 29 Apr 2025 22:09:25 +0000 Subject: [PATCH 03/65] hacked processor --- .../models/auto/image_processing_auto.py | 5 +- .../convert_perception_lm_weights_to_hf.py | 72 ++- .../image_processing_perception_lm.py | 435 ----------------- .../image_processing_perception_lm_fast.py | 123 ++--- .../models/perception_lm/image_transform.py | 441 ++++++++++++++++++ .../perception_lm/modeling_perception_lm.py | 42 +- .../perception_lm/processing_perception_lm.py | 18 +- test.py | 48 ++ 8 files changed, 611 insertions(+), 573 deletions(-) delete mode 100644 src/transformers/models/perception_lm/image_processing_perception_lm.py create mode 100644 src/transformers/models/perception_lm/image_transform.py create mode 100644 test.py diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 64666456075e..c1839598f417 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -176,6 +176,7 @@ ("xclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("yolos", ("YolosImageProcessor", "YolosImageProcessorFast")), ("zoedepth", ("ZoeDepthImageProcessor", "ZoeDepthImageProcessorFast")), + ("perception_lm", ("PerceptionLMImageProcessorFast",)), ] ) @@ -477,6 +478,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): raise initial_exception image_processor_type = config_dict.get("image_processor_type", None) + print("image_processor_type", image_processor_type) image_processor_auto_map = None if "AutoImageProcessor" in config_dict.get("auto_map", {}): image_processor_auto_map = config_dict["auto_map"]["AutoImageProcessor"] @@ -597,7 +599,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): raise ValueError( "This image processor cannot be instantiated. Please make sure you have `Pillow` installed." ) - + print("config type", type(config)) + print("config", config) raise ValueError( f"Unrecognized image processor in {pretrained_model_name_or_path}. 
Should have a " f"`image_processor_type` key in its {IMAGE_PROCESSOR_NAME} of {CONFIG_NAME}, or one of the following " diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 57634be620bd..769fa3dcd6e3 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -35,10 +35,11 @@ from transformers.models.perception_lm.configuration_perception_lm import ( PerceptionLMConfig, ) +from transformers.models.perception_lm.image_processing_perception_lm_fast import PerceptionLMImageProcessorFast from transformers.models.perception_lm.modeling_perception_lm import ( PerceptionLMForConditionalGeneration, ) - +from transformers.models.perception_lm.processing_perception_lm import PerceptionLMProcessor try: @@ -426,12 +427,33 @@ def __init__( ): super().__init__(vocab_file, additional_special_tokens=special_tokens, **kwargs) tokenizer = self.converted() - model_id = "meta-llama/Llama-3.2-1B-Instruct" - revision = "e9f8effbab1cbdc515c11ee6e098e3d5a9f51e14" - from transformers import AutoTokenizer - t = AutoTokenizer.from_pretrained(model_id, revision=revision) - additional_kwargs = {"chat_template": t.chat_template} + chat_template = ( + "{{- bos_token }}" + "{%- if messages[0]['role'] == 'system' -%}" + " {%- set system_message = messages[0]['content']|trim %}\n" + " {%- set messages = messages[1:] %}\n" + "{%- else %}" + " {%- set system_message = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.' 
%}" + "{%- endif %}" + "{{- '<|start_header_id|>system<|end_header_id|>\\n\\n' }}" + "{{- system_message }}" + "{{- '<|eot_id|>' }}" + "{%- for message in messages %}" + "{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' }}" + "{%- for content in message['content'] | selectattr('type', 'equalto', 'image') %}" + "{{ '<|image|>' }}" + "{%- endfor %}" + "{%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}" + "{{- content['text'] | trim }}" + "{%- endfor %}" + "{{'<|eot_id|>' }}" + "{%- endfor %}" + "{%- if add_generation_prompt %}" + "{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}" + "{%- endif %}" + ) + additional_kwargs = {"chat_template": chat_template} self.converted_tokenizer = PreTrainedTokenizerFast( tokenizer_object=tokenizer, @@ -470,27 +492,54 @@ def write_tokenizer( tokenizer_path, input_tokenizer_path, special_tokens=None, - context_length=11520, + params=None, push_to_hub=False, ): print("Converting the tokenizer.") tokenizer_class = ( LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast ) + context_length = params["model"]["max_seqlen"] tokenizer = Llama3Converter( input_tokenizer_path, special_tokens, context_length, ).converted_tokenizer + tokenizer.image_token = "<|image|>" + tokenizer.image_token_id = tokenizer.encode("<|image|>", add_special_tokens=False)[ + 0 + ] + + processor_config = { + "image_token": "<|image|>", + "pooling_ratio": params["model"]["pooling_ratio"], + "patch_size": params["model"]["vision_model"]["patch_size"], + "processor_class": "PerceptionLMProcessor", + } + + preprocessor_config = { + "image_processor_type": "PerceptionLMImageProcessorFast", + "vision_input_type": params["data"]["vision_input_type"], + "image_res": params["model"]["vision_model"]["image_size"], + "max_num_tiles": params["data"]["max_num_tiles"], + "normalize_img": True, + } + + image_preprocessor = PerceptionLMImageProcessorFast(**preprocessor_config) + processor = PerceptionLMProcessor( + image_processor=image_preprocessor, + tokenizer=tokenizer, + **processor_config, + ) if push_to_hub: print( f"Pushing a {tokenizer_class.__name__} to the Hub repo - {tokenizer_path}." ) - tokenizer.push_to_hub(tokenizer_path, private=True, use_temp_dir=True) + processor.push_to_hub(tokenizer_path, private=True, use_temp_dir=True) else: print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") - tokenizer.save_pretrained(tokenizer_path) + processor.save_pretrained(tokenizer_path) return tokenizer @@ -540,16 +589,15 @@ def main(): args.output_dir, spm_path, special_tokens=args.special_tokens, - context_length=params["model"]["max_seqlen"], + params=params, push_to_hub=args.push_to_hub, ) vocab_size = len(tokenizer) - image_token_id = tokenizer.encode("<|image|>", add_special_tokens=False)[0] write_model( model_path=args.output_dir, input_base_path=args.input_dir, params=params, - image_token_id=image_token_id, + image_token_id=tokenizer.image_token_id, safe_serialization=args.safe_serialization, vocab_size=vocab_size, num_shards=args.num_shards, diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm.py b/src/transformers/models/perception_lm/image_processing_perception_lm.py deleted file mode 100644 index c36d4f1b6c9d..000000000000 --- a/src/transformers/models/perception_lm/image_processing_perception_lm.py +++ /dev/null @@ -1,435 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Image processor class for PerceptionLM.""" - -from typing import Dict, List, Optional, Tuple, Union - -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import ( - convert_to_rgb, - get_resize_output_image_size, - resize, - to_channel_dimension_format, -) -from ...image_utils import ( - OPENAI_CLIP_MEAN, - OPENAI_CLIP_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, - validate_kwargs, - validate_preprocess_arguments, -) -from ...utils import TensorType, is_vision_available, logging - - -logger = logging.get_logger(__name__) - - -if is_vision_available(): - import PIL - - -class PerceptionLMImageProcessor(BaseImageProcessor): - r""" - Constructs a PerceptionLM image processor. - - Args: - do_pad (`bool`, *optional*, defaults to `False`): - Whether to pad the image to a square based on the longest edge. - The padding value is determined by the `image_mean` parameter. - Can be overridden by `do_pad` in the `preprocess` method. - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by - `do_resize` in the `preprocess` method. - size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): - Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with - the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` - method. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. - do_center_crop (`bool`, *optional*, defaults to `True`): - Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the - `preprocess` method. - crop_size (`Dict[str, int]` *optional*, defaults to 224): - Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` - method. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in - the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` - method. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. - image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): - Mean to use if normalizing the image. This is a float or list of floats the length of the number of - channels in the image. 
Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): - Standard deviation to use if normalizing the image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. - Can be overridden by the `image_std` parameter in the `preprocess` method. - do_convert_rgb (`bool`, *optional*, defaults to `True`): - Whether to convert the image to RGB. - """ - - model_input_names = ["pixel_values"] - - def __init__( - self, - do_pad: bool = False, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = True, - crop_size: Dict[str, int] = None, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = True, - **kwargs, - ) -> None: - super().__init__(**kwargs) - size = size if size is not None else {"shortest_edge": 224} - size = get_size_dict(size, default_to_square=False) - crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} - crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") - - self.do_pad = do_pad - self.do_resize = do_resize - self.size = size - self.resample = resample - self.do_center_crop = do_center_crop - self.crop_size = crop_size - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN - self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD - self.do_convert_rgb = do_convert_rgb - self._valid_processor_keys = [ - "images", - "do_pad", - "do_resize", - "size", - "resample", - "do_center_crop", - "crop_size", - "do_rescale", - "rescale_factor", - "do_normalize", - "image_mean", - "image_std", - "do_convert_rgb", - "return_tensors", - "data_format", - "input_data_format", - ] - - def pad_to_square( - self, - image: np.ndarray, - background_color: Union[int, Tuple[int, int, int]] = 0, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.array: - """ - Pads an image to a square based on the longest edge. - - Args: - image (`np.ndarray`): - The image to pad. - background_color (`int` or `Tuple[int, int, int]`, *optional*, defaults to 0): - The color to use for the padding. Can be an integer for single channel or a - tuple of integers representing for multi-channel images. If passed as integer - in mutli-channel mode, it will default to `0` in subsequent channels. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - If unset, will use same as the input image. - input_data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. 
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - If unset, will use the inferred format of the input image. - - Returns: - `np.ndarray`: The padded image. - """ - height, width = get_image_size(image, input_data_format) - num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1] - - if height == width: - image = ( - to_channel_dimension_format(image, data_format, input_data_format) - if data_format is not None - else image - ) - return image - - max_dim = max(height, width) - - # Ensure background_color is the correct shape - if isinstance(background_color, int): - background_color = [background_color] - elif len(background_color) != num_channels: - raise ValueError( - f"background_color must have no more than {num_channels} elements to match the number of channels" - ) - - if input_data_format == ChannelDimension.FIRST: - result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype) - for i, color in enumerate(background_color): - result[i, :, :] = color - if width > height: - start = (max_dim - height) // 2 - result[:, start : start + height, :] = image - else: - start = (max_dim - width) // 2 - result[:, :, start : start + width] = image - else: - result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype) - for i, color in enumerate(background_color): - result[:, :, i] = color - if width > height: - start = (max_dim - height) // 2 - result[start : start + height, :, :] = image - else: - start = (max_dim - width) // 2 - result[:, start : start + width, :] = image - - image = ( - to_channel_dimension_format(result, data_format, input_data_format) if data_format is not None else result - ) - return image - - def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> np.ndarray: - """ - Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge - resized to keep the input aspect ratio. - - Args: - image (`np.ndarray`): - Image to resize. - size (`Dict[str, int]`): - Size of the output image. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): - Resampling filter to use when resiizing the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. 
- """ - default_to_square = True - if "shortest_edge" in size: - size = size["shortest_edge"] - default_to_square = False - elif "height" in size and "width" in size: - size = (size["height"], size["width"]) - else: - raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") - - output_size = get_resize_output_image_size( - image, - size=size, - default_to_square=default_to_square, - input_data_format=input_data_format, - ) - return resize( - image, - size=output_size, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) - - def preprocess( - self, - images: ImageInput, - do_pad: Optional[bool] = None, - do_resize: Optional[bool] = None, - size: Optional[Dict[str, int]] = None, - resample: Optional[PILImageResampling] = None, - do_center_crop: Optional[bool] = None, - crop_size: Optional[int] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[float] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> PIL.Image.Image: - """ - Preprocess an image or batch of images. - - Args: - images (`ImageInput`): - Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - do_pad (`bool`, *optional*, defaults to `self.do_pad`): - Whether to pad the image to a square based on the longest edge. - The padding value is determined by the `image_mean` parameter. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with - the longest edge resized to keep the input aspect ratio. - resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only - has an effect if `do_resize` is set to `True`. - do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): - Whether to center crop the image. - crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): - Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to - `True`. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. 
- return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - """ - do_pad = do_pad if do_pad is not None else self.do_pad - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - size = get_size_dict(size, param_name="size", default_to_square=False) - resample = resample if resample is not None else self.resample - do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop - crop_size = crop_size if crop_size is not None else self.crop_size - crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - - validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) - - images = make_list_of_images(images) - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - # we don't pass `do_pad` here since PerceptionLM uses a custom padding to a square - validate_preprocess_arguments( - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - do_center_crop=do_center_crop, - crop_size=crop_size, - do_resize=do_resize, - size=size, - resample=resample, - ) - - if do_convert_rgb: - images = [convert_to_rgb(image) for image in images] - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale: - logger.warning_once( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." 
- ) - - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - processed_images = [] - for image in images: - if do_pad: - image = self.pad_to_square( - image=image, - background_color=tuple(int(x * 255) for x in self.image_mean), - input_data_format=input_data_format, - ) - - if do_resize: - image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) - - if do_center_crop: - image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) - - if do_rescale: - image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) - - if do_normalize: - image = self.normalize( - image=image, mean=image_mean, std=image_std, input_data_format=input_data_format - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - processed_images.append(image) - - return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) - - -__all__ = ["PerceptionLMImageProcessor"] diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index ed70ab96005c..2787cd1e8a7d 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -16,6 +16,8 @@ from typing import List, Optional, Tuple, Union +from transformers.models.perception_lm.image_transform import get_image_transform + from ...image_processing_utils import ( BatchFeature, ) @@ -28,8 +30,8 @@ reorder_images, ) from ...image_utils import ( - OPENAI_CLIP_MEAN, - OPENAI_CLIP_STD, + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, ChannelDimension, ImageInput, PILImageResampling, @@ -61,7 +63,11 @@ class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - do_pad: Optional[bool] + vision_input_type: str = "thumb+tile" + image_res: int = 448 + max_num_tiles: int = 36 + normalize_img: bool = True + return_tensors: Optional[Union[str, TensorType]] = None @add_start_docstrings( @@ -73,22 +79,21 @@ class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """, ) class PerceptionLMImageProcessorFast(BaseImageProcessorFast): - resample = PILImageResampling.BICUBIC - image_mean = OPENAI_CLIP_MEAN - image_std = OPENAI_CLIP_STD - size = {"shortest_edge": 224} - default_to_square = False - crop_size = {"height": 224, "width": 224} - do_pad = False - do_resize = True - do_center_crop = True - do_rescale = True - do_normalize = True - do_convert_rgb = True valid_kwargs = PerceptionLMFastImageProcessorKwargs def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None: super().__init__(**kwargs) + self.image_transform = get_image_transform( + vision_input_type=kwargs.get("vision_input_type", "thumb+tile"), + image_res=kwargs.get("image_res", 448), + max_num_tiles=kwargs.get("max_num_tiles", 36), + normalize_img=kwargs.get("normalize_img", True), + ) + + def to_dict(self): + dictionary = super().to_dict() + dictionary["image_transform"] = self.image_transform.to_dict() + return dictionary @add_start_docstrings( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, @@ -100,98 +105,22 @@ def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> No def preprocess(self, images: ImageInput, **kwargs: 
Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) - def pad_to_square( - self, - images: "torch.Tensor", - background_color: Union[int, Tuple[int, int, int]] = 0, - ) -> "torch.Tensor": - """ - Pads an image to a square based on the longest edge. - - Args: - images (`np.ndarray`): - The images to pad. - background_color (`int` or `Tuple[int, int, int]`, *optional*, defaults to 0): - The color to use for the padding. Can be an integer for single channel or a - tuple of integers representing for multi-channel images. If passed as integer - in mutli-channel mode, it will default to `0` in subsequent channels. - Returns: - `torch.Tensor`: The padded images. - """ - height, width = get_image_size(images, ChannelDimension.FIRST) - - if height == width: - return images - - num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0] - if isinstance(background_color, int): - background_color = [background_color] + [0] * (num_channels - 1) - elif len(background_color) != num_channels: - raise ValueError( - f"background_color must have no more than {num_channels} elements to match the number of channels" - ) - - max_dim = max(height, width) - paste_x_left = (max_dim - width) // 2 - paste_y_left = (max_dim - height) // 2 - paste_x_right = max_dim - width - paste_x_left - paste_y_right = max_dim - height - paste_y_left - padded_images = F.pad( - images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color - ) - - return padded_images def _preprocess( self, images: List["torch.Tensor"], - do_resize: bool, - size: SizeDict, - interpolation: Optional["F.InterpolationMode"], - do_pad: bool, - do_center_crop: bool, - crop_size: SizeDict, - do_rescale: bool, - rescale_factor: float, - do_normalize: bool, - image_mean: Optional[Union[float, List[float]]], - image_std: Optional[Union[float, List[float]]], return_tensors: Optional[Union[str, TensorType]], + **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs] ) -> BatchFeature: - # Group images by size for batched resizing + # Group images by size for batched transformation + del kwargs grouped_images, grouped_images_index = group_images_by_shape(images) - resized_images_grouped = {} - for shape, stacked_images in grouped_images.items(): - if do_pad: - stacked_images = self.pad_to_square( - images=stacked_images, background_color=tuple(int(x * 255) for x in self.image_mean) - ) - resized_images_grouped[shape] = stacked_images - padded_images = reorder_images(resized_images_grouped, grouped_images_index) - - # Group images by size for batched resizing - # Needed in case do_pad is False, or padding returns images with different sizes - grouped_images, grouped_images_index = group_images_by_shape(padded_images) - resized_images_grouped = {} - for shape, stacked_images in grouped_images.items(): - if do_resize: - stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) - resized_images_grouped[shape] = stacked_images - resized_images = reorder_images(resized_images_grouped, grouped_images_index) - - # Group images by size for further processing - # Needed in case do_resize is False, or resize returns images with different sizes - grouped_images, grouped_images_index = group_images_by_shape(resized_images) processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): - if do_center_crop: - stacked_images = self.center_crop(stacked_images, crop_size) - # Fused rescale and normalize - stacked_images = 
self.rescale_and_normalize( - stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std - ) + print("shape", shape) + stacked_images, _ = self.image_transform(stacked_images) + print("stacked_images shape", stacked_images.shape) processed_images_grouped[shape] = stacked_images - processed_images = reorder_images(processed_images_grouped, grouped_images_index) processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images diff --git a/src/transformers/models/perception_lm/image_transform.py b/src/transformers/models/perception_lm/image_transform.py new file mode 100644 index 000000000000..d6b6d31ef0b5 --- /dev/null +++ b/src/transformers/models/perception_lm/image_transform.py @@ -0,0 +1,441 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. + +import math +from functools import reduce +from logging import getLogger +from typing import Any, Callable, Tuple, Union + +import numpy as np +import torch +import torchvision.transforms as tv +from PIL import Image +from torchvision.transforms import functional as F +from torchvision.transforms.functional import InterpolationMode + +logger = getLogger() + + +MEAN = (0.5, 0.5, 0.5) +STD = (0.5, 0.5, 0.5) + + +def get_image_transform( + vision_input_type: str = "vanilla", + image_res: int = 336, + max_num_tiles: int = 1, + normalize_img: bool = True, +) -> Tuple[Callable, int]: + + if vision_input_type == "thumb+tile": + transforms = VariableSizeImageTransform( + size=image_res, + max_num_tiles=max_num_tiles, + normalize_img=normalize_img, + use_thumbnail="before", + ) + else: + transforms = ImageTransform( + size=image_res, + normalize_img=normalize_img, + ) + + logger.info( + f"Initalized transforms with: vision_input_type: '{vision_input_type}' and max_num_tiles: {max_num_tiles}." + ) + + return transforms + + +class ImageTransform(object): + """ + Image transform will resize the longer edge to a given size and pad the shorter edge with mean pixel value of the image. + """ + + def __init__( + self, + size: int = 336, + normalize_img: bool = True, + ) -> None: + self.size = size + self._mean = MEAN + self._std = STD + self.normalize_img = normalize_img + + logger.info(f"ImageTransform size: {self.size}") + + self.to_tensor = tv.ToTensor() + self.normalize = ( + tv.Normalize( + mean=self._mean, + std=self._std, + inplace=True, + ) + if normalize_img + else lambda x: x + ) + + def to_dict(self): + return { + "size": self.size, + "normalize_img": self.normalize_img, + } + + def __call__(self, image: Union[Image.Image, torch.Tensor]): + if isinstance(image, Image.Image): + w, h = image.size + else: + w, h = image.shape[-2:] + + image = F.resize( + image, + (self.size, self.size), + interpolation=InterpolationMode.BICUBIC, + ) + if isinstance(image, Image.Image): + image = self.to_tensor(image) + else: + image = F.convert_image_dtype(image, torch.float32) + image = self.normalize(image) + + # Add chunk dim to make it compatible with existing dataloaders + image = image.view(-1, 1, 3, self.size, self.size) + return image, (w, h) + + +class VariableSizeImageTransform(object): + """ + The variable size image transform will resize the image dynamically + based on the image aspect ratio and the number of image chunks we allow. + + The algorithm will not upsample low-res images to fit a certain aspect + ratio, because that leads to a significant degradation in image quality. 
+ + For example, if an input image is of size 300x800, and we want to allow + a maximum of 16 image chunks, it will find the closest aspect ratio that + is allowed within 16 image chunks, i.e., 2:5 = 2 horizontal patches and + 5 vertical patches, giving a total of 10 chunks. + + The image will then be resized to products of the base size (default is + 224px because MetaCLIP takes that), so in this case it will be resized to + 2*224:5*224 = 448:1120, where we maintain the original aspect ratio and + pad with the mean value for the rest. This approach minimizes the amount + of padding required for any arbitrary resolution. + + The final output will therefore be of shape (11, 3, 224, 224), where 10 + patches are coming from the resizing and chunking, and the first patch + is a downsampled version of the image that preserves aspect ratios. + """ + + def __init__( + self, + size: int = 336, + normalize_img: bool = True, + max_num_tiles: int = 1, + use_thumbnail: str = "no", + area_limit: bool = False, + ) -> None: + self.size = size + self._mean = MEAN + self._std = STD + + logger.info(f"VariableSizeImageTransform size: {self.size}") + + self.to_tensor = tv.ToTensor() + self.normalize = ( + tv.Normalize( + mean=self._mean, + std=self._std, + inplace=True, + ) + if normalize_img + else lambda x: x + ) + self.area_limit = area_limit + self.max_num_tiles = max_num_tiles + self.use_thumbnail = use_thumbnail + self.normalize_img = normalize_img + if self.use_thumbnail != "no": + self.thumbnail_transform = ImageTransform( + size=self.size, + normalize_img=normalize_img, + ) + + def to_dict(self): + return { + "size": self.size, + "normalize_img": self.normalize_img, + "max_num_tiles": self.max_num_tiles, + "use_thumbnail": self.use_thumbnail, + } + + @staticmethod + def _factors(n: int): + """Return all factors of a number.""" + return set( + reduce( + list.__add__, + ([i, n // i] for i in range(1, int(n**0.5) + 1) if n % i == 0), + ) + ) + + def _find_supported_aspect_ratios(self): + """ + This function computes all the allowed aspect ratios for a fixed + number of input chunks. + + For example, with `num_tiles=5`, it will return: + { + 0.2: [(1, 5)], + 5.0: [(5, 1)], + 0.25: [(1, 4)], + 1.0: [(2, 2), (1, 1)], + 4.0: [(4, 1)], + 0.3333333333333333: [(1, 3)], + 3.0: [(3, 1)], + 0.5: [(1, 2)], + 2.0: [(2, 1)] + } + """ + asp_dict = {} + for chunk_size in range(self.max_num_tiles, 0, -1): + _factors = sorted(VariableSizeImageTransform._factors(chunk_size)) + _asp_ratios = [(x, chunk_size // x) for x in _factors] + for ratio in _asp_ratios: + k = ratio[0] / ratio[1] + if k not in asp_dict: + asp_dict[k] = [ratio] + else: + asp_dict[k].append(ratio) + return asp_dict + + def _find_closest_aspect_ratio(self, img_width: int, img_height: int) -> Tuple: + """ + Given an image width, height and target number of chunks + this function will find the closest supported aspect ratio. 
+ """ + tgt_ar = img_width / img_height + asp_dict = self._find_supported_aspect_ratios() + cl_d, cl_p = 1e23, None + if tgt_ar >= 1: + cl_p = min( + [k for k in asp_dict.keys() if k <= tgt_ar], + key=lambda x: abs(x - tgt_ar), + ) + v = asp_dict[cl_p] + # select width + widths = [(idx, self.size * vv[0]) for idx, vv in enumerate(v)] + tgt_idx = max(widths, key=lambda x: x[1])[0] + else: + cl_p = min( + [k for k in asp_dict.keys() if k > tgt_ar], + key=lambda x: abs(1 / x - 1 / tgt_ar), + ) + v = asp_dict[cl_p] + # select height + heights = [(idx, self.size * vv[1]) for idx, vv in enumerate(v)] + tgt_idx = max(heights, key=lambda x: x[1])[0] + out = v[tgt_idx] + return out + + def _resize( + self, image: Image.Image, target_width: int, target_height: int + ) -> Image.Image: + # Resize longer edge to given size. + w, h = image.size + scale = w / h + + if scale > 1.0: + # width > height + new_w = target_width + new_h = math.floor(new_w / scale) + else: + # height >= width + new_h = target_height + new_w = math.floor(new_h * scale) + + image = F.resize(image, (new_h, new_w)) + return image + + def _pad( + self, image: Union[Image.Image, torch.Tensor], new_width: int, new_height: int + ) -> Union[Image.Image, torch.Tensor]: + if isinstance(image, Image.Image): + new_im = Image.new(mode="RGB", size=(new_width, new_height), color=(0, 0, 0)) # type: ignore + new_im.paste(image) + return new_im + else: + return F.pad( + image, (0, 0, new_width - image.shape[-1], new_height - image.shape[-2]) + ) + + def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor: + # Split image into number of required tiles (width x height) + batch_size, num_channels, height, width = image.size() + image = image.view( + batch_size, num_channels, nch, height // nch, ncw, width // ncw + ) + # Permute dimensions to reorder the axes + image = image.permute(0, 2, 4, 1, 3, 5).contiguous() + # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2) + image = image.view( + batch_size, ncw * nch, num_channels, height // nch, width // ncw + ) + return image + + def _get_image_height_width( + self, image_width: int, image_height: int, target_width: int, target_height: int + ) -> Tuple[int, int]: + """ + Given image width, height and target width, height for the canvas, return the dimensions of how the image would be resized + with aspect ratio preservation. + """ + scale = image_width / image_height + + if scale > 1.0: + # Width is larger than height + + # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas. + rescaling_factor = min( + target_width / image_width, target_height / image_height + ) + + # Set new width to target width and height to the rescaled height. + new_w = rescaling_factor * image_width + new_h = math.floor(new_w / scale) + + else: + # Height is larger than width + + # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas. + rescaling_factor = min( + target_width / image_width, target_height / image_height + ) + + # Set new height to target height and width to the rescaled width. 
+ new_h = rescaling_factor * image_height + new_w = math.floor(new_h * scale) + + return new_w, new_h + + def _fit_image_to_canvas( + self, img_width: int, img_height: int, area_limit: bool + ) -> Any: + """ + Given an image width, height, and target number of chunks, this function will see if the image + can be fit into any of the canvases that can be built from arranging the tiles in a grid. + If the image can be fit onto several canvases, it will return the canvas where the shorter edge + of the image will be largest. + + If area_limit is set to True, the tie-breaking prefers the canvas where the area is less than 2x the original area. + """ + # Initialize the optimal canvas to None. If no canvas is found where image fits, function returns None. + optimal_canvas = None + optimal_image_width_height = None + + scale = img_width / img_height + + # Gather all potential supported image resolutions and iterate through them to find the best match + potential_arrangements = [ + item + for sublist in self._find_supported_aspect_ratios().values() + for item in sublist + ] + for n_w, n_h in potential_arrangements: + # Compute the canvas size + canvas_width, canvas_height = n_w * self.size, n_h * self.size + + # Check if image can fit into the canvas without downsampling + if canvas_width >= img_width and canvas_height >= img_height: + # If we did not find a good canvas yet, we will use the current one + if optimal_canvas is None: + # Set optimal canvas and determine the actual image height and width in the canvas with aspect ratio preserving resampling + optimal_canvas = (n_w, n_h) + optimal_image_width_height = self._get_image_height_width( + image_width=img_width, + image_height=img_height, + target_width=n_w * self.size, + target_height=n_h * self.size, + ) + else: + # If we already found an optimal canvas before, we will check if the shorter edge of the image will be larger than the current optimal canvas. + # This means we can potentially upsample the image resolution which is beneficial to performance. + image_width_height = self._get_image_height_width( + image_width=img_width, + image_height=img_height, + target_width=n_w * self.size, + target_height=n_h * self.size, + ) + if area_limit: + # Prioritize aspect ratio, and choose the best within the area limit when tied. + curr_scale = image_width_height[0] / image_width_height[1] + optim_scale = ( + optimal_image_width_height[0] + / optimal_image_width_height[1] + ) + if abs(scale - curr_scale) < abs(scale - optim_scale): + # 1. optimize aspect ratio + optimal_canvas = (n_w, n_h) + optimal_image_width_height = image_width_height + elif abs(scale - curr_scale) == abs(scale - optim_scale): + # 2. optimize area + if ( + image_width_height[0] * image_width_height[1] + < 2 * img_width * img_height + ): + # 2.1 area is less than 2x the original area + optimal_canvas = (n_w, n_h) + optimal_image_width_height = image_width_height + else: + # NOTE: L3V dynamic tiling. Prioritize the biggest canvas.
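+ # With area_limit=False, ties are broken by keeping the canvas that maximizes the resized length of the image's shorter edge: the resized width for portrait inputs (scale < 1.0), the resized height for landscape inputs.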
+ if ( + scale < 1.0 + and (image_width_height[0] >= optimal_image_width_height[0]) + ) or ( + scale >= 1.0 + and (image_width_height[1] >= optimal_image_width_height[1]) + ): + optimal_canvas = (n_w, n_h) + optimal_image_width_height = image_width_height + return optimal_canvas + + def __call__( + self, image: Union[Image.Image, torch.Tensor] = None + ) -> Tuple[Any, Any]: + if self.use_thumbnail != "no": + thumbnail = self.thumbnail_transform(image)[0] + + if isinstance(image, Image.Image): + w, h = image.size + else: + w, h = image.shape[-2:] + + # Check if the image can be fit to the canvas without downsampling + ar = self._fit_image_to_canvas( + img_width=w, img_height=h, area_limit=self.area_limit + ) + if ar is None: + # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image + ar = self._find_closest_aspect_ratio(img_width=w, img_height=h) + + image = F.resize( + image, + (ar[1] * self.size, ar[0] * self.size), # (h, w) + interpolation=InterpolationMode.BICUBIC, + ) + image = self._pad(image, ar[0] * self.size, ar[1] * self.size) + + if isinstance(image, Image.Image): + image = self.to_tensor(image) + else: + image = F.convert_image_dtype(image, torch.float32) + + image = self.normalize(image) + image = self._split(image, ar[0], ar[1]) # type: ignore + if self.use_thumbnail == "before": + image = torch.cat((thumbnail, image), dim=1) + elif self.use_thumbnail == "after": + image = torch.cat((image, thumbnail), dim=1) + elif self.use_thumbnail == "both": + image = torch.cat((thumbnail, image, thumbnail), dim=1) + + return image, ar diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 2b432553b06a..1c35f53bda14 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -298,7 +298,7 @@ class PerceptionLMForConditionalGeneration( ): def __init__(self, config: PerceptionLMConfig): super().__init__(config) - self.vision_tower = AutoModel.from_config(config.vision_config) + # self.vision_tower = AutoModel.from_config(config.vision_config) self.multi_modal_projector = PerceptionLMMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size @@ -363,28 +363,28 @@ def get_image_features( kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. 
- image_outputs = self.vision_tower( - pixel_values, output_hidden_states=True, **kwargs - ) - - # If we have one vision feature layer, return the corresponding hidden states, - # otherwise, select the hidden states of each feature layer and concatenate them - if isinstance(vision_feature_layer, int): - selected_image_feature = image_outputs.hidden_states[vision_feature_layer] - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - else: - hs_pool = [ - image_outputs.hidden_states[layer_idx] - for layer_idx in vision_feature_layer - ] - # For default; crop CLS from each hidden state in the hidden state pool - if vision_feature_select_strategy == "default": - hs_pool = [hs[:, 1:] for hs in hs_pool] - selected_image_feature = torch.cat(hs_pool, dim=-1) + # image_outputs = self.vision_tower( + # pixel_values, output_hidden_states=True, **kwargs + # ) + + # # If we have one vision feature layer, return the corresponding hidden states, + # # otherwise, select the hidden states of each feature layer and concatenate them + # if isinstance(vision_feature_layer, int): + # selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + # if vision_feature_select_strategy == "default": + # selected_image_feature = selected_image_feature[:, 1:] + # else: + # hs_pool = [ + # image_outputs.hidden_states[layer_idx] + # for layer_idx in vision_feature_layer + # ] + # # For default; crop CLS from each hidden state in the hidden state pool + # if vision_feature_select_strategy == "default": + # hs_pool = [hs[:, 1:] for hs in hs_pool] + # selected_image_feature = torch.cat(hs_pool, dim=-1) image_features = torch.load( "/checkpoint/vision_encoder/smhu/debug/0/h_img_dump_0.pt" - ).to(selected_image_feature) + ).to(pixel_values.device).to(torch.bfloat16) image_features = self.multi_modal_projector(image_features) return image_features diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index cb989af91830..fb472f3433c4 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -81,14 +81,16 @@ def __init__( patch_size=None, vision_feature_select_strategy=None, chat_template=None, - image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + image_token="<|image|>", # set the default and let users change if they have peculiar special tokens in rare cases num_additional_image_tokens=0, + pooling_ratio=2, **kwargs, ): self.patch_size = patch_size self.num_additional_image_tokens = num_additional_image_tokens self.vision_feature_select_strategy = vision_feature_select_strategy self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.pooling_ratio = pooling_ratio self.image_token_id = ( tokenizer.image_token_id if getattr(tokenizer, "image_token_id", None) @@ -147,6 +149,7 @@ def __call__( **kwargs, ) if images is not None: + print("image_processor class", self.image_processor.__class__) image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) else: image_inputs = {} @@ -161,13 +164,14 @@ def __call__( if image_inputs.get("pixel_values") is not None: # Replace the image token with the expanded image token sequence pixel_values = image_inputs["pixel_values"] + print("pixel_values", pixel_values.shape) height, width = 
get_image_size(to_numpy_array(pixel_values[0])) - num_image_tokens = (height // self.patch_size) * ( - width // self.patch_size - ) + self.num_additional_image_tokens - if self.vision_feature_select_strategy == "default": - num_image_tokens -= 1 - + num_tiles = pixel_values[0].shape[0] + num_image_tokens = (height // self.patch_size // self.pooling_ratio) * ( + width // self.patch_size // self.pooling_ratio + ) * num_tiles + print("num_image_tokens", num_image_tokens) + print("self.image_token", self.image_token) prompt_strings = [] for sample in text: sample = sample.replace(self.image_token, self.image_token * num_image_tokens) diff --git a/test.py b/test.py new file mode 100644 index 000000000000..a555deccc73a --- /dev/null +++ b/test.py @@ -0,0 +1,48 @@ +import torch + +from transformers import AutoProcessor +from transformers import PerceptionLMForConditionalGeneration + +processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b", use_fast=True) +print(type(processor)) + +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") +conversation = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/14496_0.PNG", + }, + {"type": "text", "text": "Describe the bar plot in the image."}, + ], + } +] + +print(model.config) + + +inputs = processor.apply_chat_template( + conversation, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", +) +original_token_ids = inputs["input_ids"].cpu().numpy().tolist() +token_ids = torch.load("/checkpoint/vision_encoder/smhu/debug/0/token_values_dump_0.pt") +desired_token_ids = token_ids.cpu().numpy().tolist() + +assert original_token_ids == desired_token_ids + +inputs = inputs.to(model.device) +torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") +generate_ids = model.generate(**inputs, max_new_tokens=256) +# Remove input_ids from generate_ids to get only the newly generated tokens +input_length = inputs["input_ids"].shape[1] +generate_ids_without_inputs = generate_ids[:, input_length:] + +print(generate_ids_without_inputs.cpu().numpy().tolist()) +for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): + print(output) From 2a1169b0879ca66e864b671779a8e9ca0946439b Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 30 Apr 2025 22:39:04 +0000 Subject: [PATCH 04/65] First version that reproduced PLM output using PE from timm. 
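A minimal sketch of how the vision tower added in this patch is expected to be wired (a sketch only: shapes assume the default PerceptionEncoderConfig values, and it requires a timm build that provides timm.models.pe.PE, which modeling_perception_lm.py imports):

    import torch
    from transformers.models.perception_lm.configuration_perception_lm import PerceptionEncoderConfig
    from transformers.models.perception_lm.modeling_perception_lm import PerceptionEncoder

    config = PerceptionEncoderConfig()   # defaults: image_size=448, patch_size=14, width=1024, pool_type="none"
    encoder = PerceptionEncoder(config)  # wraps timm's PE with attention pooling and output projection disabled
    tiles = torch.randn(2, 3, config.image_size, config.image_size)  # a stack of image tiles
    features = encoder(tiles)            # expected shape (2, (448 // 14) ** 2, 1024); the CLS token is dropped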
--- .../models/auto/configuration_auto.py | 2 + .../configuration_perception_lm.py | 32 ++++----- .../convert_perception_lm_weights_to_hf.py | 21 ++---- .../image_processing_perception_lm_fast.py | 4 +- .../perception_lm/modeling_perception_lm.py | 67 +++++++++++-------- 5 files changed, 67 insertions(+), 59 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 637b4f093512..a68fe785763a 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -268,6 +268,7 @@ ("pegasus", "PegasusConfig"), ("pegasus_x", "PegasusXConfig"), ("perceiver", "PerceiverConfig"), + ("perception_encoder", "PerceptionEncoderConfig"), ("persimmon", "PersimmonConfig"), ("phi", "PhiConfig"), ("phi3", "Phi3Config"), @@ -665,6 +666,7 @@ ("pegasus", "Pegasus"), ("pegasus_x", "PEGASUS-X"), ("perceiver", "Perceiver"), + ("perception_encoder", "PerceptionEncoder"), ("persimmon", "Persimmon"), ("phi", "Phi"), ("phi3", "Phi3"), diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 38ac3462f57f..786cb2aebcdc 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -20,6 +20,19 @@ logger = logging.get_logger(__name__) +class PerceptionEncoderConfig(PretrainedConfig): + image_size: int = 448 + patch_size: int = 14 + width: int = 1024 + layers: int = 23 + heads: int = 16 + use_cls_token: bool = True + use_abs_posemb: bool = True + ls_init_value: float = 0.1 + drop_path: float = 0.1 + mlp_ratio: float = 4.0 + use_ln_post: bool = False + pool_type: str = "none" class PerceptionLMConfig(PretrainedConfig): r""" @@ -107,22 +120,11 @@ def __init__( self.vision_feature_layer = vision_feature_layer if isinstance(vision_config, dict): - vision_config["model_type"] = ( - vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model" - ) - vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + vision_config = PerceptionEncoderConfig(**vision_config) + elif isinstance(vision_config, PerceptionEncoderConfig): + vision_config = vision_config elif vision_config is None: - vision_config = CONFIG_MAPPING["clip_vision_model"]( - intermediate_size=4096, - hidden_size=1024, - patch_size=14, - image_size=336, - num_hidden_layers=24, - num_attention_heads=16, - vocab_size=32000, - projection_dim=768, - ) - + vision_config = PerceptionEncoderConfig() self.vision_config = vision_config if isinstance(text_config, dict): diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 769fa3dcd6e3..47b206fbba81 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -23,17 +23,16 @@ from tokenizers import AddedToken, processors from transformers import ( - CLIPVisionConfig, GenerationConfig, LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, - SiglipVisionConfig, ) from transformers.convert_slow_tokenizer import TikTokenConverter from transformers.models.perception_lm.configuration_perception_lm import ( PerceptionLMConfig, + PerceptionEncoderConfig, ) from 
transformers.models.perception_lm.image_processing_perception_lm_fast import PerceptionLMImageProcessorFast from transformers.models.perception_lm.modeling_perception_lm import ( @@ -93,7 +92,6 @@ """ KEYS_TO_MODIFY_MAPPING = { - "model.vision_tower.": "", "model.mm_projector": "multi_modal_projector", "model": "model.model", "vision_model.model": "vision_model", @@ -309,6 +307,9 @@ def permute(w, n_heads, dim1=dim, dim2=dim): "vision_projector.projector.2.bias" ], } + for k, v in loaded.items(): + if "vision_model" in k: + state_dict[k] = v state_dict = convert_state_dict_to_hf(state_dict) for k, v in state_dict.items(): index_dict["weight_map"][k] = filename @@ -362,17 +363,9 @@ def permute(w, n_heads, dim1=dim, dim2=dim): tie_word_embeddings=tie_word_embeddings, ) - vision_config = CLIPVisionConfig( - hidden_size=1024, - image_size=336, - intermediate_size=4096, - num_attention_heads=16, - num_hidden_layers=24, - patch_size=14, - projection_dim=768, - vocab_size=32000, - ) - + vision_config = PerceptionEncoderConfig(**model_params["vision_model"]) + print("vision_config: ", vision_config) + config = PerceptionLMConfig( text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 2787cd1e8a7d..c71dfaf724ac 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -117,13 +117,11 @@ def _preprocess( grouped_images, grouped_images_index = group_images_by_shape(images) processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): - print("shape", shape) stacked_images, _ = self.image_transform(stacked_images) - print("stacked_images shape", stacked_images.shape) + print("stacked_images shape: ", stacked_images.shape) processed_images_grouped[shape] = stacked_images processed_images = reorder_images(processed_images_grouped, grouped_images_index) processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images - return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 1c35f53bda14..a4e3ef049d39 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -35,8 +35,9 @@ logging, replace_return_docstrings, ) +from timm.models.pe import PE from ..auto import AutoModel, AutoModelForCausalLM -from .configuration_perception_lm import PerceptionLMConfig +from .configuration_perception_lm import PerceptionLMConfig, PerceptionEncoderConfig logger = logging.get_logger(__name__) @@ -47,6 +48,34 @@ _CHECKPOINT_FOR_DOC = "facebook/Perception-LM-1B" +class PerceptionEncoder(PE): + def __init__(self, config: PerceptionEncoderConfig): + assert config.pool_type == "none" + self.use_cls_token = config.use_cls_token + # Converting configs to timm PE args + super().__init__( + img_size=config.image_size, + patch_size=config.patch_size, + width=config.width, + layers=config.layers, + heads=config.heads, + mlp_ratio=config.mlp_ratio, + use_cls_token=config.use_cls_token, + use_abs_posemb=config.use_abs_posemb, + use_ln_post=config.use_ln_post, + ls_init_value=config.ls_init_value, + 
output_dim=config.width, + use_attn_pool=False, + use_proj=False, + ) + + def forward(self, x): + x = super().forward(x) + if self.use_cls_token: + return x[:, 1:, :] + else: + return x + @dataclass # Copied from transformers.models.llava.modeling_llava.LlavaCausalLMOutputWithPast with Llava->PerceptionLM class PerceptionLMCausalLMOutputWithPast(ModelOutput): @@ -132,7 +161,7 @@ def forward(self, x): class PerceptionLMMultiModalProjector(nn.Module): def __init__(self, config: PerceptionLMConfig): super().__init__() - input_size = config.vision_config.hidden_size + input_size = config.vision_config.width output_size = config.text_config.hidden_size self.projector = nn.Sequential( nn.Linear( @@ -298,7 +327,7 @@ class PerceptionLMForConditionalGeneration( ): def __init__(self, config: PerceptionLMConfig): super().__init__(config) - # self.vision_tower = AutoModel.from_config(config.vision_config) + self.vision_model = PerceptionEncoder(config.vision_config) self.multi_modal_projector = PerceptionLMMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size @@ -363,29 +392,13 @@ def get_image_features( kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. - # image_outputs = self.vision_tower( - # pixel_values, output_hidden_states=True, **kwargs - # ) - - # # If we have one vision feature layer, return the corresponding hidden states, - # # otherwise, select the hidden states of each feature layer and concatenate them - # if isinstance(vision_feature_layer, int): - # selected_image_feature = image_outputs.hidden_states[vision_feature_layer] - # if vision_feature_select_strategy == "default": - # selected_image_feature = selected_image_feature[:, 1:] - # else: - # hs_pool = [ - # image_outputs.hidden_states[layer_idx] - # for layer_idx in vision_feature_layer - # ] - # # For default; crop CLS from each hidden state in the hidden state pool - # if vision_feature_select_strategy == "default": - # hs_pool = [hs[:, 1:] for hs in hs_pool] - # selected_image_feature = torch.cat(hs_pool, dim=-1) - image_features = torch.load( - "/checkpoint/vision_encoder/smhu/debug/0/h_img_dump_0.pt" - ).to(pixel_values.device).to(torch.bfloat16) - image_features = self.multi_modal_projector(image_features) + print("pixel_values shape: ", pixel_values.shape) + image_outputs = self.vision_model(pixel_values[0]) + # image_features = torch.load( + # "/checkpoint/vision_encoder/smhu/debug/0/h_img_dump_0.pt" + # ).to(pixel_values.device).to(torch.bfloat16) + print("image_outputs shape: ", image_outputs.shape) + image_features = self.multi_modal_projector(image_outputs) return image_features @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) @@ -489,7 +502,7 @@ def forward( if pixel_values is not None: image_features = self.get_image_features( - pixel_values=pixel_values, + pixel_values=pixel_values.to(inputs_embeds), vision_feature_layer=vision_feature_layer, vision_feature_select_strategy=vision_feature_select_strategy, image_sizes=image_sizes, From e093e20634297015eea22df517fd2022215a064b Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Thu, 1 May 2025 00:28:55 +0000 Subject: [PATCH 05/65] Simplify and fix tie_word_embeddings --- .../configuration_perception_lm.py | 19 +------ .../convert_perception_lm_weights_to_hf.py | 51 +++++-------------- .../perception_lm/modeling_perception_lm.py | 39 +------------- test.py | 8 +-- 4 files changed, 16 insertions(+), 101 deletions(-) diff 
--git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 786cb2aebcdc..f90db925a6e8 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -98,27 +98,10 @@ def __init__( vision_config=None, text_config=None, projector_pooling_ratio=1, - image_token_index=32000, - projector_hidden_act="gelu", - vision_feature_select_strategy="default", - vision_feature_layer=-2, - image_seq_length=576, - multimodal_projector_bias=True, + image_token_index=128002, **kwargs, ): self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.image_seq_length = image_seq_length - - if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError( - "vision_feature_select_strategy should be one of 'default', 'full'." - f"Got: {vision_feature_select_strategy}" - ) - - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - if isinstance(vision_config, dict): vision_config = PerceptionEncoderConfig(**vision_config) elif isinstance(vision_config, PerceptionEncoderConfig): diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 47b206fbba81..6366ac9a818e 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -25,7 +25,6 @@ from transformers import ( GenerationConfig, LlamaConfig, - LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, ) @@ -91,15 +90,6 @@ """ -KEYS_TO_MODIFY_MAPPING = { - "model.mm_projector": "multi_modal_projector", - "model": "model.model", - "vision_model.model": "vision_model", - "lm_head": "language_model.lm_head", - "model.model": "language_model.model", - "language_model.model.image_newline": "image_newline", -} - BOS_ADDED_TOKEN = AddedToken( "<|begin_of_text|>", single_word=False, @@ -142,17 +132,6 @@ } -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value.to(torch.float16) - return new_state_dict - - def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): return multiple_of * ( (int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of @@ -244,40 +223,39 @@ def permute(w, n_heads, dim1=dim, dim2=dim): filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" assert num_shards == 1, "PerceptionLM does not support sharded weights" state_dict = { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( + f"language_model.model.layers.{layer_i}.self_attn.q_proj.weight": permute( loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( + f"language_model.model.layers.{layer_i}.self_attn.k_proj.weight": permute( loaded[f"layers.{layer_i}.attention.wk.weight"], n_heads=num_key_value_heads, dim1=key_value_dim, ), - f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[ + f"language_model.model.layers.{layer_i}.self_attn.v_proj.weight": loaded[ f"layers.{layer_i}.attention.wv.weight" ], - 
f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[ + f"language_model.model.layers.{layer_i}.self_attn.o_proj.weight": loaded[ f"layers.{layer_i}.attention.wo.weight" ], - f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[ + f"language_model.model.layers.{layer_i}.mlp.gate_proj.weight": loaded[ f"layers.{layer_i}.feed_forward.w1.weight" ], - f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[ + f"language_model.model.layers.{layer_i}.mlp.down_proj.weight": loaded[ f"layers.{layer_i}.feed_forward.w2.weight" ], - f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[ + f"language_model.model.layers.{layer_i}.mlp.up_proj.weight": loaded[ f"layers.{layer_i}.feed_forward.w3.weight" ], - f"model.layers.{layer_i}.input_layernorm.weight": loaded[ + f"language_model.model.layers.{layer_i}.input_layernorm.weight": loaded[ f"layers.{layer_i}.attention_norm.weight" ], - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ + f"language_model.model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ f"layers.{layer_i}.ffn_norm.weight" ], } - state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = ( + state_dict[f"language_model.model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = ( inv_freq ) - state_dict = convert_state_dict_to_hf(state_dict) for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() @@ -287,9 +265,9 @@ def permute(w, n_heads, dim1=dim, dim2=dim): filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" state_dict = { - "model.embed_tokens.weight": loaded["tok_embeddings.weight"], - "model.norm.weight": loaded["norm.weight"], - "model.lm_head.weight": ( + "language_model.model.embed_tokens.weight": loaded["tok_embeddings.weight"], + "language_model.model.norm.weight": loaded["norm.weight"], + "language_model.lm_head.weight": ( loaded["output.weight"] if not tie_word_embeddings else loaded["tok_embeddings.weight"] @@ -310,7 +288,6 @@ def permute(w, n_heads, dim1=dim, dim2=dim): for k, v in loaded.items(): if "vision_model" in k: state_dict[k] = v - state_dict = convert_state_dict_to_hf(state_dict) for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() @@ -364,8 +341,6 @@ def permute(w, n_heads, dim1=dim, dim2=dim): ) vision_config = PerceptionEncoderConfig(**model_params["vision_model"]) - print("vision_config: ", vision_config) - config = PerceptionLMConfig( text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index a4e3ef049d39..263d8229c21d 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -64,6 +64,7 @@ def __init__(self, config: PerceptionEncoderConfig): use_abs_posemb=config.use_abs_posemb, use_ln_post=config.use_ln_post, ls_init_value=config.ls_init_value, + drop_path=config.drop_path, output_dim=config.width, use_attn_pool=False, use_proj=False, @@ -365,8 +366,6 @@ def get_decoder(self): def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: Union[int, List[int]], - vision_feature_select_strategy: str, **kwargs, ): """ @@ -385,18 +384,8 @@ def get_image_features( Returns: image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). 
""" - if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError( - f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}" - ) - - kwargs = {k: v for k, v in kwargs.items() if v is not None} - # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. print("pixel_values shape: ", pixel_values.shape) image_outputs = self.vision_model(pixel_values[0]) - # image_features = torch.load( - # "/checkpoint/vision_encoder/smhu/debug/0/h_img_dump_0.pt" - # ).to(pixel_values.device).to(torch.bfloat16) print("image_outputs shape: ", image_outputs.shape) image_features = self.multi_modal_projector(image_outputs) return image_features @@ -476,17 +465,6 @@ def forward( return_dict = ( return_dict if return_dict is not None else self.config.use_return_dict ) - vision_feature_layer = ( - vision_feature_layer - if vision_feature_layer is not None - else self.config.vision_feature_layer - ) - vision_feature_select_strategy = ( - vision_feature_select_strategy - if vision_feature_select_strategy is not None - else self.config.vision_feature_select_strategy - ) - if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError( "You must specify exactly one of input_ids or inputs_embeds" @@ -503,33 +481,18 @@ def forward( if pixel_values is not None: image_features = self.get_image_features( pixel_values=pixel_values.to(inputs_embeds), - vision_feature_layer=vision_feature_layer, - vision_feature_select_strategy=vision_feature_select_strategy, - image_sizes=image_sizes, ) special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) special_image_mask = special_image_mask.expand_as(inputs_embeds).to( inputs_embeds.device ) - # if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel(): - # n_image_tokens = (input_ids == self.config.image_token_id).sum() - # n_image_features = image_features.shape[0] * image_features.shape[1] - # raise ValueError( - # f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - # ) image_features = image_features.to( inputs_embeds.device, inputs_embeds.dtype ) inputs_embeds = inputs_embeds.masked_scatter( special_image_mask, image_features ) - # print(inputs_embeds.shape) - # occhi_embeds = torch.load( - # "/checkpoint/vision_encoder/smhu/debug/0/h_post_stitching_dump_0.pt" - # ) - # print(occhi_embeds.shape) - # inputs_embeds = occhi_embeds outputs = self.language_model( attention_mask=attention_mask, diff --git a/test.py b/test.py index a555deccc73a..c26c3694aa1a 100644 --- a/test.py +++ b/test.py @@ -30,14 +30,8 @@ return_dict=True, return_tensors="pt", ) -original_token_ids = inputs["input_ids"].cpu().numpy().tolist() -token_ids = torch.load("/checkpoint/vision_encoder/smhu/debug/0/token_values_dump_0.pt") -desired_token_ids = token_ids.cpu().numpy().tolist() - -assert original_token_ids == desired_token_ids - inputs = inputs.to(model.device) -torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") +# torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") generate_ids = model.generate(**inputs, max_new_tokens=256) # Remove input_ids from generate_ids to get only the newly generated tokens input_length = inputs["input_ids"].shape[1] From 31aa91b5468c32e1fff51503d6be850baecca3ee Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 2 May 2025 02:00:08 +0000 Subject: [PATCH 06/65] Use PIL 
resize. Simplify conversion. --- .../convert_perception_lm_weights_to_hf.py | 11 ++-- .../models/perception_lm/image_transform.py | 59 ++++++++++--------- .../perception_lm/modeling_perception_lm.py | 8 ++- test.py | 6 +- 4 files changed, 47 insertions(+), 37 deletions(-) diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 6366ac9a818e..d11a6654ad0c 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -154,7 +154,7 @@ def write_model( params, image_token_id, safe_serialization=True, - vocab_size=None, + tokenizer=None, num_shards=None, push_to_hub=False, ): @@ -307,8 +307,8 @@ def permute(w, n_heads, dim1=dim, dim2=dim): model_params["multiple_of"] if "multiple_of" in model_params else 256 ) - bos_token_id = 128000 - eos_token_id = [128001, 128008, 128009] + bos_token_id = tokenizer.convert_tokens_to_ids("<|begin_of_text|>") + eos_token_id = [tokenizer.convert_tokens_to_ids(t) for t in ["<|end_of_text|>", "<|eot_id|>"]] use_scaled_rope = model_params["use_scaled_rope"] if use_scaled_rope: @@ -331,7 +331,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): num_hidden_layers=model_params["n_layers"], rms_norm_eps=model_params["norm_eps"], num_key_value_heads=num_key_value_heads, - vocab_size=vocab_size, + vocab_size=len(tokenizer), rope_theta=base, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, @@ -560,14 +560,13 @@ def main(): params=params, push_to_hub=args.push_to_hub, ) - vocab_size = len(tokenizer) write_model( model_path=args.output_dir, input_base_path=args.input_dir, params=params, image_token_id=tokenizer.image_token_id, safe_serialization=args.safe_serialization, - vocab_size=vocab_size, + tokenizer=tokenizer, num_shards=args.num_shards, push_to_hub=args.push_to_hub, ) diff --git a/src/transformers/models/perception_lm/image_transform.py b/src/transformers/models/perception_lm/image_transform.py index d6b6d31ef0b5..8f5b94cebb7c 100644 --- a/src/transformers/models/perception_lm/image_transform.py +++ b/src/transformers/models/perception_lm/image_transform.py @@ -3,7 +3,7 @@ import math from functools import reduce from logging import getLogger -from typing import Any, Callable, Tuple, Union +from typing import Any, Callable, Tuple, Union, Sequence import numpy as np import torch @@ -11,6 +11,8 @@ from PIL import Image from torchvision.transforms import functional as F from torchvision.transforms.functional import InterpolationMode +from torchvision.transforms import ToPILImage, PILToTensor + logger = getLogger() @@ -19,6 +21,29 @@ STD = (0.5, 0.5, 0.5) +""" +Resize the image to the given size. Supports both PIL images and torch.Tensor. +If the image is a tensor, it is expected to be a batch of images with shape (B, C, H, W) and dtype uint8. +If use_pil_resize is True, the images will be resized using the PIL implementation of interpolation. 
+""" + + +def _resize( + image: Union[Image.Image, torch.Tensor], + size: Sequence[int], + use_pil_resize: bool = True, +) -> Union[Image.Image, torch.Tensor]: + if isinstance(image, torch.Tensor) and use_pil_resize: + ims = [] + for im in image: + im = ToPILImage()(im) + im = F.resize(im, size, interpolation=InterpolationMode.BICUBIC) + ims.append(PILToTensor()(im)) + return torch.stack(ims, dim=0) + else: + return F.resize(image, size, interpolation=InterpolationMode.BICUBIC) + + def get_image_transform( vision_input_type: str = "vanilla", image_res: int = 336, @@ -84,12 +109,11 @@ def __call__(self, image: Union[Image.Image, torch.Tensor]): if isinstance(image, Image.Image): w, h = image.size else: - w, h = image.shape[-2:] + h, w = image.shape[-2:] - image = F.resize( + image = _resize( image, (self.size, self.size), - interpolation=InterpolationMode.BICUBIC, ) if isinstance(image, Image.Image): image = self.to_tensor(image) @@ -237,25 +261,6 @@ def _find_closest_aspect_ratio(self, img_width: int, img_height: int) -> Tuple: out = v[tgt_idx] return out - def _resize( - self, image: Image.Image, target_width: int, target_height: int - ) -> Image.Image: - # Resize longer edge to given size. - w, h = image.size - scale = w / h - - if scale > 1.0: - # width > height - new_w = target_width - new_h = math.floor(new_w / scale) - else: - # height >= width - new_h = target_height - new_w = math.floor(new_h * scale) - - image = F.resize(image, (new_h, new_w)) - return image - def _pad( self, image: Union[Image.Image, torch.Tensor], new_width: int, new_height: int ) -> Union[Image.Image, torch.Tensor]: @@ -407,20 +412,21 @@ def __call__( if isinstance(image, Image.Image): w, h = image.size else: - w, h = image.shape[-2:] + h, w = image.shape[-2:] # Check if the image can be fit to the canvas without downsampling ar = self._fit_image_to_canvas( img_width=w, img_height=h, area_limit=self.area_limit ) + print("orginal w, h", w, h) if ar is None: # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image ar = self._find_closest_aspect_ratio(img_width=w, img_height=h) - image = F.resize( + print("closest aspect ratio", ar) + image = _resize( image, (ar[1] * self.size, ar[0] * self.size), # (h, w) - interpolation=InterpolationMode.BICUBIC, ) image = self._pad(image, ar[0] * self.size, ar[1] * self.size) @@ -437,5 +443,4 @@ def __call__( image = torch.cat((image, thumbnail), dim=1) elif self.use_thumbnail == "both": image = torch.cat((thumbnail, image, thumbnail), dim=1) - return image, ar diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 263d8229c21d..2654ad19f856 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -69,7 +69,7 @@ def __init__(self, config: PerceptionEncoderConfig): use_attn_pool=False, use_proj=False, ) - + def forward(self, x): x = super().forward(x) if self.use_cls_token: @@ -77,6 +77,7 @@ def forward(self, x): else: return x + @dataclass # Copied from transformers.models.llava.modeling_llava.LlavaCausalLMOutputWithPast with Llava->PerceptionLM class PerceptionLMCausalLMOutputWithPast(ModelOutput): @@ -385,8 +386,13 @@ def get_image_features( image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). 
""" print("pixel_values shape: ", pixel_values.shape) + # torch.save(pixel_values, "/tmp/occhi/0/_1.pt") + # pixel_values = ( + # torch.load("/tmp/occhi/0/images_dump_1.pt").unsqueeze(0).to(pixel_values) + # ) image_outputs = self.vision_model(pixel_values[0]) print("image_outputs shape: ", image_outputs.shape) + # image_outputs = torch.load("/tmp/occhi/0/h_img_dump_0.pt").to(image_outputs) image_features = self.multi_modal_projector(image_outputs) return image_features diff --git a/test.py b/test.py index c26c3694aa1a..deb547d61019 100644 --- a/test.py +++ b/test.py @@ -3,17 +3,17 @@ from transformers import AutoProcessor from transformers import PerceptionLMForConditionalGeneration -processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b", use_fast=True) +processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b", use_fast=True) print(type(processor)) -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b").to(torch.bfloat16).to("cuda") conversation = [ { "role": "user", "content": [ { "type": "image", - "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/14496_0.PNG", + "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/3263_0.JPEG", }, {"type": "text", "text": "Describe the bar plot in the image."}, ], From 91de705ae54776b2f16ae875025f2e6aa06a9c58 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 7 May 2025 23:42:54 +0000 Subject: [PATCH 07/65] First version that works with video input. --- .../convert_perception_lm_weights_to_hf.py | 27 ++++++----- .../image_processing_perception_lm_fast.py | 37 +++++++++++---- .../perception_lm/processing_perception_lm.py | 11 ++--- test.py | 6 +-- test_video.py | 45 +++++++++++++++++++ 5 files changed, 99 insertions(+), 27 deletions(-) create mode 100644 test_video.py diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index d11a6654ad0c..9874b0eb86ef 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -33,11 +33,15 @@ PerceptionLMConfig, PerceptionEncoderConfig, ) -from transformers.models.perception_lm.image_processing_perception_lm_fast import PerceptionLMImageProcessorFast +from transformers.models.perception_lm.image_processing_perception_lm_fast import ( + PerceptionLMImageProcessorFast, +) from transformers.models.perception_lm.modeling_perception_lm import ( PerceptionLMForConditionalGeneration, ) -from transformers.models.perception_lm.processing_perception_lm import PerceptionLMProcessor +from transformers.models.perception_lm.processing_perception_lm import ( + PerceptionLMProcessor, +) try: @@ -253,9 +257,9 @@ def permute(w, n_heads, dim1=dim, dim2=dim): f"layers.{layer_i}.ffn_norm.weight" ], } - state_dict[f"language_model.model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = ( - inv_freq - ) + state_dict[ + f"language_model.model.layers.{layer_i}.self_attn.rotary_emb.inv_freq" + ] = inv_freq for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() @@ -308,7 +312,10 @@ def permute(w, n_heads, dim1=dim, dim2=dim): ) bos_token_id = 
tokenizer.convert_tokens_to_ids("<|begin_of_text|>") - eos_token_id = [tokenizer.convert_tokens_to_ids(t) for t in ["<|end_of_text|>", "<|eot_id|>"]] + eos_token_id = [ + tokenizer.convert_tokens_to_ids(t) + for t in ["<|end_of_text|>", "<|eot_id|>"] + ] use_scaled_rope = model_params["use_scaled_rope"] if use_scaled_rope: @@ -409,7 +416,7 @@ def __init__( "{{- '<|eot_id|>' }}" "{%- for message in messages %}" "{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' }}" - "{%- for content in message['content'] | selectattr('type', 'equalto', 'image') %}" + "{%- for content in message['content'] | selectattr('type', 'in', ['image', 'video']) %}" "{{ '<|image|>' }}" "{%- endfor %}" "{%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}" @@ -485,15 +492,15 @@ def write_tokenizer( "processor_class": "PerceptionLMProcessor", } - preprocessor_config = { + image_preprocessor_config = { "image_processor_type": "PerceptionLMImageProcessorFast", "vision_input_type": params["data"]["vision_input_type"], "image_res": params["model"]["vision_model"]["image_size"], "max_num_tiles": params["data"]["max_num_tiles"], + "max_frame_tiles": 1, "normalize_img": True, } - - image_preprocessor = PerceptionLMImageProcessorFast(**preprocessor_config) + image_preprocessor = PerceptionLMImageProcessorFast(**image_preprocessor_config) processor = PerceptionLMProcessor( image_processor=image_preprocessor, tokenizer=tokenizer, diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index c71dfaf724ac..adf25bd9b38b 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -16,6 +16,8 @@ from typing import List, Optional, Tuple, Union +import numpy as np + from transformers.models.perception_lm.image_transform import get_image_transform from ...image_processing_utils import ( @@ -34,6 +36,7 @@ IMAGENET_STANDARD_STD, ChannelDimension, ImageInput, + VideoInput, PILImageResampling, SizeDict, get_image_size, @@ -89,10 +92,17 @@ def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> No max_num_tiles=kwargs.get("max_num_tiles", 36), normalize_img=kwargs.get("normalize_img", True), ) + self.video_transform = get_image_transform( + vision_input_type="vanilla", + image_res=kwargs.get("image_res", 448), + max_num_tiles=kwargs.get("max_frame_tiles", 1), + normalize_img=kwargs.get("normalize_img", True), + ) def to_dict(self): dictionary = super().to_dict() dictionary["image_transform"] = self.image_transform.to_dict() + dictionary["video_transform"] = self.video_transform.to_dict() return dictionary @add_start_docstrings( @@ -109,20 +119,29 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[PerceptionLMFastImageP def _preprocess( self, images: List["torch.Tensor"], + videos: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]], **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs] ) -> BatchFeature: # Group images by size for batched transformation del kwargs - grouped_images, grouped_images_index = group_images_by_shape(images) - processed_images_grouped = {} - for shape, stacked_images in grouped_images.items(): - stacked_images, _ = self.image_transform(stacked_images) - print("stacked_images shape: ", stacked_images.shape) - processed_images_grouped[shape] = stacked_images - processed_images = 
reorder_images(processed_images_grouped, grouped_images_index) - processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images - return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + if images: + grouped_images, grouped_images_index = group_images_by_shape(images) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + stacked_images, _ = self.image_transform(stacked_images) + print("stacked_images shape: ", stacked_images.shape) + processed_images_grouped[shape] = stacked_images + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + elif videos: + videos = [torch.from_numpy(np.array(v)).flatten(0, 1).permute(0, 3, 1, 2) for v in videos] + processed_videos = [self.video_transform(v)[0].squeeze(1) for v in videos] + processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos + return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors) + else: + return BatchFeature(data={"pixel_values": None}, tensor_type=return_tensors) __all__ = ["PerceptionLMImageProcessorFast"] diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index fb472f3433c4..8930e7fbf33a 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -19,7 +19,7 @@ from typing import List, Union from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging @@ -81,7 +81,7 @@ def __init__( patch_size=None, vision_feature_select_strategy=None, chat_template=None, - image_token="<|image|>", # set the default and let users change if they have peculiar special tokens in rare cases + image_token="<|image|>", num_additional_image_tokens=0, pooling_ratio=2, **kwargs, @@ -103,7 +103,7 @@ def __call__( images: ImageInput = None, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, audio=None, - videos=None, + videos: VideoInput = None, **kwargs: Unpack[PerceptionLMProcessorKwargs], ) -> BatchFeature: """ @@ -148,9 +148,10 @@ def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) - if images is not None: + if images is not None or videos is not None: print("image_processor class", self.image_processor.__class__) - image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + images = [] if images is None else images + image_inputs = self.image_processor(images=images, videos=videos, **output_kwargs["images_kwargs"]) else: image_inputs = {} diff --git a/test.py b/test.py index deb547d61019..c26c3694aa1a 100644 --- a/test.py +++ b/test.py @@ -3,17 +3,17 @@ from transformers import AutoProcessor from transformers import PerceptionLMForConditionalGeneration -processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b", 
use_fast=True) +processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b", use_fast=True) print(type(processor)) -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b").to(torch.bfloat16).to("cuda") +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") conversation = [ { "role": "user", "content": [ { "type": "image", - "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/3263_0.JPEG", + "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/14496_0.PNG", }, {"type": "text", "text": "Describe the bar plot in the image."}, ], diff --git a/test_video.py b/test_video.py new file mode 100644 index 000000000000..99382ecdbfc1 --- /dev/null +++ b/test_video.py @@ -0,0 +1,45 @@ +import torch + +from transformers import AutoProcessor +from transformers import PerceptionLMForConditionalGeneration + +processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_3b") +print(type(processor)) + +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_3b").to(torch.bfloat16).to("cuda") +conversation = [ + { + "role": "user", + "content": [ + { + "type": "video", + "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4", + }, + {"type": "text", "text": "Can you describe the video in detail?"}, + ], + } +] + +print(model.config) + + +inputs = processor.apply_chat_template( + conversation, + add_generation_prompt=True, + num_frames=32, + # video_fps=1, + tokenize=True, + return_dict=True, + return_tensors="pt", + video_load_backend="decord", +) +inputs = inputs.to(model.device) +# torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") +generate_ids = model.generate(**inputs, max_new_tokens=256) +# Remove input_ids from generate_ids to get only the newly generated tokens +input_length = inputs["input_ids"].shape[1] +generate_ids_without_inputs = generate_ids[:, input_length:] + +print(generate_ids_without_inputs.cpu().numpy().tolist()) +for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): + print(output) From 71582cc66422237950d56d4b51e376761207fe94 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Thu, 8 May 2025 01:03:31 +0000 Subject: [PATCH 08/65] simplifed image preprocessing (not batched) --- .../image_processing_perception_lm_fast.py | 57 ++++++++++++------- .../models/perception_lm/image_transform.py | 37 +++++++----- .../perception_lm/modeling_perception_lm.py | 16 +----- test.py | 6 +- test_video.py | 6 +- 5 files changed, 66 insertions(+), 56 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index adf25bd9b38b..f06c28198650 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -112,36 +112,51 @@ def to_dict(self): Whether to pad the image to a square based on the longest edge. 
Can be overridden by the `do_pad` parameter """, ) - def preprocess(self, images: ImageInput, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature: - return super().preprocess(images, **kwargs) - - - def _preprocess( - self, - images: List["torch.Tensor"], - videos: List["torch.Tensor"], - return_tensors: Optional[Union[str, TensorType]], - **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs] - ) -> BatchFeature: - # Group images by size for batched transformation - del kwargs + def preprocess(self, images: ImageInput, videos: VideoInput, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature: + return_tensors = kwargs.get("return_tensors", "pt") if images: - grouped_images, grouped_images_index = group_images_by_shape(images) - processed_images_grouped = {} - for shape, stacked_images in grouped_images.items(): - stacked_images, _ = self.image_transform(stacked_images) - print("stacked_images shape: ", stacked_images.shape) - processed_images_grouped[shape] = stacked_images - processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = [] + for image in images: + processed = [self.image_transform(im)[0] for im in image] + processed = torch.cat(processed, dim=0) + processed_images.append(processed) processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) elif videos: videos = [torch.from_numpy(np.array(v)).flatten(0, 1).permute(0, 3, 1, 2) for v in videos] - processed_videos = [self.video_transform(v)[0].squeeze(1) for v in videos] + processed_videos = [self.video_transform(v)[0] for v in videos] processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors) else: return BatchFeature(data={"pixel_values": None}, tensor_type=return_tensors) + # def _preprocess( + # self, + # images: List["torch.Tensor"], + # videos: List["torch.Tensor"], + # return_tensors: Optional[Union[str, TensorType]], + # **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs] + # ) -> BatchFeature: + # # Group images by size for batched transformation + # del kwargs + # if images: + # grouped_images, grouped_images_index = group_images_by_shape(images) + # processed_images_grouped = {} + # for shape, stacked_images in grouped_images.items(): + # stacked_images, _ = self.image_transform(stacked_images) + # print("stacked_images shape: ", stacked_images.shape) + # processed_images_grouped[shape] = stacked_images + # processed_images = reorder_images(processed_images_grouped, grouped_images_index) + # processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + # return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + # elif videos: + # videos = [torch.from_numpy(np.array(v)).flatten(0, 1).permute(0, 3, 1, 2) for v in videos] + # processed_videos = [self.video_transform(v)[0].squeeze(1) for v in videos] + # processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos + # return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors) + # else: + # return BatchFeature(data={"pixel_values": None}, tensor_type=return_tensors) + + __all__ = ["PerceptionLMImageProcessorFast"] diff --git a/src/transformers/models/perception_lm/image_transform.py 
b/src/transformers/models/perception_lm/image_transform.py index 8f5b94cebb7c..f6c348c6ee23 100644 --- a/src/transformers/models/perception_lm/image_transform.py +++ b/src/transformers/models/perception_lm/image_transform.py @@ -114,6 +114,7 @@ def __call__(self, image: Union[Image.Image, torch.Tensor]): image = _resize( image, (self.size, self.size), + use_pil_resize=False, ) if isinstance(image, Image.Image): image = self.to_tensor(image) @@ -122,7 +123,7 @@ def __call__(self, image: Union[Image.Image, torch.Tensor]): image = self.normalize(image) # Add chunk dim to make it compatible with existing dataloaders - image = image.view(-1, 1, 3, self.size, self.size) + image = image.view(-1, 3, self.size, self.size) return image, (w, h) @@ -272,21 +273,31 @@ def _pad( return F.pad( image, (0, 0, new_width - image.shape[-1], new_height - image.shape[-2]) ) - + def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor: # Split image into number of required tiles (width x height) - batch_size, num_channels, height, width = image.size() - image = image.view( - batch_size, num_channels, nch, height // nch, ncw, width // ncw - ) + num_channels, height, width = image.size() + image = image.view(num_channels, nch, height // nch, ncw, width // ncw) # Permute dimensions to reorder the axes - image = image.permute(0, 2, 4, 1, 3, 5).contiguous() + image = image.permute(1, 3, 0, 2, 4).contiguous() # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2) - image = image.view( - batch_size, ncw * nch, num_channels, height // nch, width // ncw - ) + image = image.view(ncw * nch, num_channels, height // nch, width // ncw) return image + # def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor: + # # Split image into number of required tiles (width x height) + # batch_size, num_channels, height, width = image.size() + # image = image.view( + # batch_size, num_channels, nch, height // nch, ncw, width // ncw + # ) + # # Permute dimensions to reorder the axes + # image = image.permute(0, 2, 4, 1, 3, 5).contiguous() + # # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2) + # image = image.view( + # batch_size, ncw * nch, num_channels, height // nch, width // ncw + # ) + # return image + def _get_image_height_width( self, image_width: int, image_height: int, target_width: int, target_height: int ) -> Tuple[int, int]: @@ -438,9 +449,9 @@ def __call__( image = self.normalize(image) image = self._split(image, ar[0], ar[1]) # type: ignore if self.use_thumbnail == "before": - image = torch.cat((thumbnail, image), dim=1) + image = torch.cat((thumbnail, image), dim=0) elif self.use_thumbnail == "after": - image = torch.cat((image, thumbnail), dim=1) + image = torch.cat((image, thumbnail), dim=0) elif self.use_thumbnail == "both": - image = torch.cat((thumbnail, image, thumbnail), dim=1) + image = torch.cat((thumbnail, image, thumbnail), dim=0) return image, ar diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 2654ad19f856..e9bba9248da5 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -373,26 +373,14 @@ def get_image_features( Obtains image last hidden states from the vision tower and apply multimodal projection. 
Args: - pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_tiles, channels, height, width)`) The tensors corresponding to the input images. - vision_feature_layer (`Union[int, List[int]]`): - The index of the layer to select the vision feature. If multiple indices are provided, - the vision feature of the corresponding indices will be concatenated to form the - vision features. - vision_feature_select_strategy (`str`): - The feature selection strategy used to select the vision feature from the vision backbone. - Can be one of `"default"` or `"full"` Returns: - image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`). """ print("pixel_values shape: ", pixel_values.shape) - # torch.save(pixel_values, "/tmp/occhi/0/_1.pt") - # pixel_values = ( - # torch.load("/tmp/occhi/0/images_dump_1.pt").unsqueeze(0).to(pixel_values) - # ) image_outputs = self.vision_model(pixel_values[0]) print("image_outputs shape: ", image_outputs.shape) - # image_outputs = torch.load("/tmp/occhi/0/h_img_dump_0.pt").to(image_outputs) image_features = self.multi_modal_projector(image_outputs) return image_features diff --git a/test.py b/test.py index c26c3694aa1a..e92d57a2b922 100644 --- a/test.py +++ b/test.py @@ -20,9 +20,7 @@ } ] -print(model.config) - - +# print(model.config) inputs = processor.apply_chat_template( conversation, add_generation_prompt=True, @@ -37,6 +35,6 @@ input_length = inputs["input_ids"].shape[1] generate_ids_without_inputs = generate_ids[:, input_length:] -print(generate_ids_without_inputs.cpu().numpy().tolist()) +# print(generate_ids_without_inputs.cpu().numpy().tolist()) for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): print(output) diff --git a/test_video.py b/test_video.py index 99382ecdbfc1..b02b71ab25bf 100644 --- a/test_video.py +++ b/test_video.py @@ -20,9 +20,7 @@ } ] -print(model.config) - - +# print(model.config) inputs = processor.apply_chat_template( conversation, add_generation_prompt=True, @@ -40,6 +38,6 @@ input_length = inputs["input_ids"].shape[1] generate_ids_without_inputs = generate_ids[:, input_length:] -print(generate_ids_without_inputs.cpu().numpy().tolist()) +# print(generate_ids_without_inputs.cpu().numpy().tolist()) for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): print(output) From e1d5f8f84d7271faeca0164b4ace5c094c61262f Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Mon, 12 May 2025 20:58:21 +0000 Subject: [PATCH 09/65] Minor fixes after rebasing on main. 
--- .../perception_lm/image_processing_perception_lm_fast.py | 6 +----- .../models/perception_lm/processing_perception_lm.py | 3 ++- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index f06c28198650..b542d7c1ea8b 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -24,8 +24,6 @@ BatchFeature, ) from ...image_processing_utils_fast import ( - BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, - BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, BaseImageProcessorFast, DefaultFastImageProcessorKwargs, group_images_by_shape, @@ -36,11 +34,11 @@ IMAGENET_STANDARD_STD, ChannelDimension, ImageInput, - VideoInput, PILImageResampling, SizeDict, get_image_size, ) +from ...video_utils import VideoInput from ...processing_utils import Unpack from ...utils import ( TensorType, @@ -75,7 +73,6 @@ class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): @add_start_docstrings( "Constructs a fast PerceptionLM image processor.", - BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, """ do_pad (`bool`, *optional*, defaults to `self.do_pad`): Whether to pad the image to a square based on the longest edge. Can be overridden by the `do_pad` parameter @@ -106,7 +103,6 @@ def to_dict(self): return dictionary @add_start_docstrings( - BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, """ do_pad (`bool`, *optional*, defaults to `self.do_pad`): Whether to pad the image to a square based on the longest edge. Can be overridden by the `do_pad` parameter diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 8930e7fbf33a..2787beda338f 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -19,7 +19,8 @@ from typing import List, Union from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array +from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...video_utils import VideoInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging From 74db88481d4d302a728ca03b08dc4ef845c9b8e9 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 13 May 2025 01:21:46 +0000 Subject: [PATCH 10/65] Video processor based on new API. 
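A condensed sketch of how the pieces introduced in this patch fit together. The
values are illustrative (the conversion script reads the real ones from the
original PLM params file), and it assumes the branch from this series is
installed:

    from transformers.models.perception_lm.image_processing_perception_lm_fast import (
        PerceptionLMImageProcessorFast,
    )
    from transformers.models.perception_lm.video_processing_perception_lm import (
        PerceptionLMVideoProcessor,
    )

    image_res = 448
    # Tiled "thumb+tile" transform for still images, unchanged by this patch.
    image_processor = PerceptionLMImageProcessorFast(
        vision_input_type="thumb+tile",
        image_res=image_res,
        max_num_tiles=36,
        normalize_img=True,
    )
    # Per-frame resize/normalize for videos, built on the new BaseVideoProcessor API.
    video_processor = PerceptionLMVideoProcessor(size={"height": image_res, "width": image_res})
    # Both are handed to PerceptionLMProcessor(video_processor=..., image_processor=...,
    # tokenizer=...); video inputs come back under `pixel_values_videos`, which
    # PerceptionLMForConditionalGeneration.forward now accepts alongside `pixel_values`.
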
--- .../convert_perception_lm_weights_to_hf.py | 12 +++- .../image_processing_perception_lm_fast.py | 28 ++++---- .../perception_lm/modeling_perception_lm.py | 6 +- .../perception_lm/processing_perception_lm.py | 66 ++++++++++++------- .../video_processing_perception_lm.py | 54 +++++++++++++++ test.py | 4 +- test_video.py | 2 +- 7 files changed, 129 insertions(+), 43 deletions(-) create mode 100644 src/transformers/models/perception_lm/video_processing_perception_lm.py diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 9874b0eb86ef..b5b038596af2 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -36,6 +36,9 @@ from transformers.models.perception_lm.image_processing_perception_lm_fast import ( PerceptionLMImageProcessorFast, ) +from transformers.models.perception_lm.video_processing_perception_lm import ( + PerceptionLMVideoProcessor, +) from transformers.models.perception_lm.modeling_perception_lm import ( PerceptionLMForConditionalGeneration, ) @@ -491,18 +494,25 @@ def write_tokenizer( "patch_size": params["model"]["vision_model"]["patch_size"], "processor_class": "PerceptionLMProcessor", } + image_res = params["model"]["vision_model"]["image_size"] image_preprocessor_config = { "image_processor_type": "PerceptionLMImageProcessorFast", "vision_input_type": params["data"]["vision_input_type"], - "image_res": params["model"]["vision_model"]["image_size"], + "image_res": image_res, "max_num_tiles": params["data"]["max_num_tiles"], "max_frame_tiles": 1, "normalize_img": True, } image_preprocessor = PerceptionLMImageProcessorFast(**image_preprocessor_config) + video_preprocessor_config = { + "video_processor_type": "PerceptionLMVideoProcessor", + "size": {"height": image_res, "width": image_res}, + } + video_preprocessor = PerceptionLMVideoProcessor(**video_preprocessor_config) processor = PerceptionLMProcessor( image_processor=image_preprocessor, + video_processor=video_preprocessor, tokenizer=tokenizer, **processor_config, ) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index b542d7c1ea8b..7825048604d7 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -38,7 +38,7 @@ SizeDict, get_image_size, ) -from ...video_utils import VideoInput +# from ...video_utils import VideoInput from ...processing_utils import Unpack from ...utils import ( TensorType, @@ -89,17 +89,17 @@ def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> No max_num_tiles=kwargs.get("max_num_tiles", 36), normalize_img=kwargs.get("normalize_img", True), ) - self.video_transform = get_image_transform( - vision_input_type="vanilla", - image_res=kwargs.get("image_res", 448), - max_num_tiles=kwargs.get("max_frame_tiles", 1), - normalize_img=kwargs.get("normalize_img", True), - ) + # self.video_transform = get_image_transform( + # vision_input_type="vanilla", + # image_res=kwargs.get("image_res", 448), + # max_num_tiles=kwargs.get("max_frame_tiles", 1), + # normalize_img=kwargs.get("normalize_img", True), + # ) def to_dict(self): dictionary = super().to_dict() dictionary["image_transform"] = self.image_transform.to_dict() 
- dictionary["video_transform"] = self.video_transform.to_dict() + # dictionary["video_transform"] = self.video_transform.to_dict() return dictionary @add_start_docstrings( @@ -108,7 +108,7 @@ def to_dict(self): Whether to pad the image to a square based on the longest edge. Can be overridden by the `do_pad` parameter """, ) - def preprocess(self, images: ImageInput, videos: VideoInput, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature: return_tensors = kwargs.get("return_tensors", "pt") if images: processed_images = [] @@ -118,11 +118,11 @@ def preprocess(self, images: ImageInput, videos: VideoInput, **kwargs: Unpack[Pe processed_images.append(processed) processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) - elif videos: - videos = [torch.from_numpy(np.array(v)).flatten(0, 1).permute(0, 3, 1, 2) for v in videos] - processed_videos = [self.video_transform(v)[0] for v in videos] - processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos - return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors) + # elif videos: + # videos = [torch.from_numpy(np.array(v)).flatten(0, 1).permute(0, 3, 1, 2) for v in videos] + # processed_videos = [self.video_transform(v)[0] for v in videos] + # processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos + # return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors) else: return BatchFeature(data={"pixel_values": None}, tensor_type=return_tensors) diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index e9bba9248da5..efc45a8063f3 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -392,6 +392,7 @@ def forward( self, input_ids: Optional[torch.LongTensor] = None, pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, @@ -463,6 +464,8 @@ def forward( raise ValueError( "You must specify exactly one of input_ids or inputs_embeds" ) + if pixel_values_videos is not None: + pixel_values = pixel_values_videos if pixel_values is not None and inputs_embeds is not None: raise ValueError( @@ -548,6 +551,7 @@ def prepare_inputs_for_generation( past_key_values=None, inputs_embeds=None, pixel_values=None, + pixel_values_videos=None, attention_mask=None, cache_position=None, logits_to_keep=None, @@ -569,7 +573,7 @@ def prepare_inputs_for_generation( # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model model_inputs["pixel_values"] = pixel_values - + model_inputs["pixel_values_videos"] = pixel_values_videos return model_inputs diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 2787beda338f..9c8c9ddb9130 100644 --- 
a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -64,7 +64,7 @@ class PerceptionLMProcessor(ProcessorMixin): extra tokens appended, no need to set this arg. """ - attributes = ["image_processor", "tokenizer"] + attributes = ["video_processor", "image_processor", "tokenizer"] valid_kwargs = [ "chat_template", "patch_size", @@ -73,16 +73,18 @@ class PerceptionLMProcessor(ProcessorMixin): "num_additional_image_tokens", ] image_processor_class = "AutoImageProcessor" + video_processor_class = "AutoVideoProcessor" tokenizer_class = "AutoTokenizer" def __init__( self, + video_processor=None, image_processor=None, tokenizer=None, patch_size=None, vision_feature_select_strategy=None, chat_template=None, - image_token="<|image|>", + media_token="<|image|>", num_additional_image_tokens=0, pooling_ratio=2, **kwargs, @@ -90,14 +92,14 @@ def __init__( self.patch_size = patch_size self.num_additional_image_tokens = num_additional_image_tokens self.vision_feature_select_strategy = vision_feature_select_strategy - self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.media_token = tokenizer.media_token if hasattr(tokenizer, "media_token") else media_token self.pooling_ratio = pooling_ratio - self.image_token_id = ( - tokenizer.image_token_id - if getattr(tokenizer, "image_token_id", None) - else tokenizer.convert_tokens_to_ids(self.image_token) + self.media_token_id = ( + tokenizer.media_token_id + if getattr(tokenizer, "media_token_id", None) + else tokenizer.convert_tokens_to_ids(self.media_token) ) - super().__init__(image_processor, tokenizer, chat_template=chat_template) + super().__init__(video_processor, image_processor, tokenizer, chat_template=chat_template) def __call__( self, @@ -149,40 +151,56 @@ def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) - if images is not None or videos is not None: + if images is not None: print("image_processor class", self.image_processor.__class__) images = [] if images is None else images - image_inputs = self.image_processor(images=images, videos=videos, **output_kwargs["images_kwargs"]) + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) else: image_inputs = {} + if videos is not None: + videos_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"]) + else: + videos_inputs = {} + if isinstance(text, str): text = [text] elif not isinstance(text, list) and not isinstance(text[0], str): raise ValueError("Invalid input text. 
Please provide a string, or a list of strings") # try to expand inputs in processing if we have the necessary parts - prompt_strings = text + prompt_strings = [] + pixel_values = None + pixel_values_videos = None + if image_inputs.get("pixel_values") is not None: - # Replace the image token with the expanded image token sequence pixel_values = image_inputs["pixel_values"] - print("pixel_values", pixel_values.shape) - height, width = get_image_size(to_numpy_array(pixel_values[0])) - num_tiles = pixel_values[0].shape[0] - num_image_tokens = (height // self.patch_size // self.pooling_ratio) * ( + if videos_inputs.get("pixel_values_videos") is not None: + pixel_values_videos = videos_inputs["pixel_values_videos"] + for i, sample in enumerate(text): + if pixel_values is not None: + media = pixel_values[i] + elif pixel_values_videos is not None: + media = pixel_values_videos[i] + else: + continue + + # Replace the media token with the expanded media token sequence + print("media.shape", media.shape) + height, width = get_image_size(to_numpy_array(media)) + num_tiles = media.shape[0] + num_media_tokens = (height // self.patch_size // self.pooling_ratio) * ( width // self.patch_size // self.pooling_ratio ) * num_tiles - print("num_image_tokens", num_image_tokens) - print("self.image_token", self.image_token) - prompt_strings = [] - for sample in text: - sample = sample.replace(self.image_token, self.image_token * num_image_tokens) - prompt_strings.append(sample) + print("num_media_tokens", num_media_tokens) + print("self.media_token", self.media_token) + sample = sample.replace(self.media_token, self.media_token * num_media_tokens) + prompt_strings.append(sample) return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) - self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) - return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["media"]) + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/perception_lm/video_processing_perception_lm.py b/src/transformers/models/perception_lm/video_processing_perception_lm.py new file mode 100644 index 000000000000..58c0d8d1b651 --- /dev/null +++ b/src/transformers/models/perception_lm/video_processing_perception_lm.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Video processor class for PerceptionLM.""" + +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, +) +from ...processing_utils import Unpack, VideosKwargs +from ...utils import is_vision_available +from ...utils.import_utils import requires +from ...video_processing_utils import ( + BaseVideoProcessor, +) + + +if is_vision_available(): + from ...image_utils import PILImageResampling + + +class PerceptionLMFastVideoProcessorInitKwargs(VideosKwargs): ... + + +@requires(backends=("torchvision",)) +class PerceptionLMVideoProcessor(BaseVideoProcessor): + resample = PILImageResampling.BICUBIC + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + size = {"height": 448, "width": 448} + do_resize = True + do_center_crop = False + do_rescale = True + do_normalize = True + do_convert_rgb = True + valid_kwargs = PerceptionLMFastVideoProcessorInitKwargs + model_input_names = ["pixel_values_videos"] + + def __init__(self, **kwargs: Unpack[PerceptionLMFastVideoProcessorInitKwargs]): + super().__init__(**kwargs) + + +__all__ = ["PerceptionLMVideoProcessor"] diff --git a/test.py b/test.py index e92d57a2b922..3b5b54a4de85 100644 --- a/test.py +++ b/test.py @@ -3,10 +3,10 @@ from transformers import AutoProcessor from transformers import PerceptionLMForConditionalGeneration -processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b", use_fast=True) +processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_3b", use_fast=True) print(type(processor)) -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_3b").to(torch.bfloat16).to("cuda") conversation = [ { "role": "user", diff --git a/test_video.py b/test_video.py index b02b71ab25bf..16a77d20cfc6 100644 --- a/test_video.py +++ b/test_video.py @@ -13,7 +13,7 @@ "content": [ { "type": "video", - "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4", + "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/tdiuLmVuCKk_000007_000017.mp4", }, {"type": "text", "text": "Can you describe the video in detail?"}, ], From fb5ae4bddfaa85f44c211ca46e0123c1f7f41dff Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 13 May 2025 02:07:04 +0000 Subject: [PATCH 11/65] Revert to use _preprocess for image processor. 
--- .../image_processing_perception_lm_fast.py | 77 ++++--------------- .../models/perception_lm/image_transform.py | 36 ++++----- test_video.py | 2 +- 3 files changed, 28 insertions(+), 87 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 7825048604d7..0ab78dfd8d3e 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -29,16 +29,6 @@ group_images_by_shape, reorder_images, ) -from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - SizeDict, - get_image_size, -) -# from ...video_utils import VideoInput from ...processing_utils import Unpack from ...utils import ( TensorType, @@ -89,70 +79,31 @@ def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> No max_num_tiles=kwargs.get("max_num_tiles", 36), normalize_img=kwargs.get("normalize_img", True), ) - # self.video_transform = get_image_transform( - # vision_input_type="vanilla", - # image_res=kwargs.get("image_res", 448), - # max_num_tiles=kwargs.get("max_frame_tiles", 1), - # normalize_img=kwargs.get("normalize_img", True), - # ) def to_dict(self): dictionary = super().to_dict() dictionary["image_transform"] = self.image_transform.to_dict() - # dictionary["video_transform"] = self.video_transform.to_dict() return dictionary - @add_start_docstrings( - """ - do_pad (`bool`, *optional*, defaults to `self.do_pad`): - Whether to pad the image to a square based on the longest edge. Can be overridden by the `do_pad` parameter - """, - ) - def preprocess(self, images: ImageInput, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature: - return_tensors = kwargs.get("return_tensors", "pt") + def _preprocess( + self, + images: List["torch.Tensor"], + return_tensors: Optional[Union[str, TensorType]], + **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs] + ) -> BatchFeature: + # Group images by size for batched transformation + del kwargs if images: - processed_images = [] - for image in images: - processed = [self.image_transform(im)[0] for im in image] - processed = torch.cat(processed, dim=0) - processed_images.append(processed) + grouped_images, grouped_images_index = group_images_by_shape(images) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + stacked_images, _ = self.image_transform(stacked_images) + processed_images_grouped[shape] = stacked_images + processed_images = reorder_images(processed_images_grouped, grouped_images_index) processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) - # elif videos: - # videos = [torch.from_numpy(np.array(v)).flatten(0, 1).permute(0, 3, 1, 2) for v in videos] - # processed_videos = [self.video_transform(v)[0] for v in videos] - # processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos - # return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors) else: return BatchFeature(data={"pixel_values": None}, tensor_type=return_tensors) - # def _preprocess( - # self, - # images: List["torch.Tensor"], - # videos: List["torch.Tensor"], - # return_tensors: Optional[Union[str, TensorType]], - # **kwargs: 
Unpack[PerceptionLMFastImageProcessorKwargs] - # ) -> BatchFeature: - # # Group images by size for batched transformation - # del kwargs - # if images: - # grouped_images, grouped_images_index = group_images_by_shape(images) - # processed_images_grouped = {} - # for shape, stacked_images in grouped_images.items(): - # stacked_images, _ = self.image_transform(stacked_images) - # print("stacked_images shape: ", stacked_images.shape) - # processed_images_grouped[shape] = stacked_images - # processed_images = reorder_images(processed_images_grouped, grouped_images_index) - # processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images - # return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) - # elif videos: - # videos = [torch.from_numpy(np.array(v)).flatten(0, 1).permute(0, 3, 1, 2) for v in videos] - # processed_videos = [self.video_transform(v)[0].squeeze(1) for v in videos] - # processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos - # return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors) - # else: - # return BatchFeature(data={"pixel_values": None}, tensor_type=return_tensors) - - __all__ = ["PerceptionLMImageProcessorFast"] diff --git a/src/transformers/models/perception_lm/image_transform.py b/src/transformers/models/perception_lm/image_transform.py index f6c348c6ee23..c5f0e5dc416d 100644 --- a/src/transformers/models/perception_lm/image_transform.py +++ b/src/transformers/models/perception_lm/image_transform.py @@ -123,7 +123,7 @@ def __call__(self, image: Union[Image.Image, torch.Tensor]): image = self.normalize(image) # Add chunk dim to make it compatible with existing dataloaders - image = image.view(-1, 3, self.size, self.size) + image = image.view(1, -1, 3, self.size, self.size) return image, (w, h) @@ -273,31 +273,21 @@ def _pad( return F.pad( image, (0, 0, new_width - image.shape[-1], new_height - image.shape[-2]) ) - + def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor: # Split image into number of required tiles (width x height) - num_channels, height, width = image.size() - image = image.view(num_channels, nch, height // nch, ncw, width // ncw) + batch_size, num_channels, height, width = image.size() + image = image.view( + batch_size, num_channels, nch, height // nch, ncw, width // ncw + ) # Permute dimensions to reorder the axes - image = image.permute(1, 3, 0, 2, 4).contiguous() + image = image.permute(0, 2, 4, 1, 3, 5).contiguous() # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2) - image = image.view(ncw * nch, num_channels, height // nch, width // ncw) + image = image.view( + batch_size, ncw * nch, num_channels, height // nch, width // ncw + ) return image - # def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor: - # # Split image into number of required tiles (width x height) - # batch_size, num_channels, height, width = image.size() - # image = image.view( - # batch_size, num_channels, nch, height // nch, ncw, width // ncw - # ) - # # Permute dimensions to reorder the axes - # image = image.permute(0, 2, 4, 1, 3, 5).contiguous() - # # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2) - # image = image.view( - # batch_size, ncw * nch, num_channels, height // nch, width // ncw - # ) - # return image - def _get_image_height_width( self, image_width: int, image_height: int, target_width: int, target_height: 
int ) -> Tuple[int, int]: @@ -449,9 +439,9 @@ def __call__( image = self.normalize(image) image = self._split(image, ar[0], ar[1]) # type: ignore if self.use_thumbnail == "before": - image = torch.cat((thumbnail, image), dim=0) + image = torch.cat((thumbnail, image), dim=1) elif self.use_thumbnail == "after": - image = torch.cat((image, thumbnail), dim=0) + image = torch.cat((image, thumbnail), dim=1) elif self.use_thumbnail == "both": - image = torch.cat((thumbnail, image, thumbnail), dim=0) + image = torch.cat((thumbnail, image, thumbnail), dim=1) return image, ar diff --git a/test_video.py b/test_video.py index 16a77d20cfc6..b02b71ab25bf 100644 --- a/test_video.py +++ b/test_video.py @@ -13,7 +13,7 @@ "content": [ { "type": "video", - "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/tdiuLmVuCKk_000007_000017.mp4", + "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4", }, {"type": "text", "text": "Can you describe the video in detail?"}, ], From ee716c3fe72c1576220e408e62af8b6571e71e7d Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 14 May 2025 21:59:33 +0000 Subject: [PATCH 12/65] refactor with modular --- .../convert_perception_lm_weights_to_hf.py | 2 +- .../image_processing_perception_lm_fast.py | 14 - .../models/perception_lm/image_transform.py | 2 - .../perception_lm/modeling_perception_lm.py | 304 +++++---- .../perception_lm/modular_perception_lm.py | 639 ++++++++++++++++++ 5 files changed, 802 insertions(+), 159 deletions(-) create mode 100644 src/transformers/models/perception_lm/modular_perception_lm.py diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index b5b038596af2..39705c41e4c5 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -294,7 +294,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): } for k, v in loaded.items(): if "vision_model" in k: - state_dict[k] = v + state_dict[k.replace("vision_model", "vision_tower")] = v for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 0ab78dfd8d3e..9c6ab3812adb 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -34,25 +34,11 @@ TensorType, add_start_docstrings, is_torch_available, - is_torchvision_available, - is_torchvision_v2_available, - is_vision_available, ) - -if is_vision_available(): - from ...image_utils import PILImageResampling - if is_torch_available(): import torch -if is_torchvision_available(): - if is_torchvision_v2_available(): - from torchvision.transforms.v2 import functional as F - else: - from torchvision.transforms import functional as F - - class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): vision_input_type: str = "thumb+tile" image_res: int = 448 diff --git a/src/transformers/models/perception_lm/image_transform.py b/src/transformers/models/perception_lm/image_transform.py index c5f0e5dc416d..17791ef1e5be 100644 --- a/src/transformers/models/perception_lm/image_transform.py +++ b/src/transformers/models/perception_lm/image_transform.py @@ 
-419,12 +419,10 @@ def __call__( ar = self._fit_image_to_canvas( img_width=w, img_height=h, area_limit=self.area_limit ) - print("orginal w, h", w, h) if ar is None: # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image ar = self._find_closest_aspect_ratio(img_width=w, img_height=h) - print("closest aspect ratio", ar) image = _resize( image, (ar[1] * self.size, ar[0] * self.size), # (h, w) diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index efc45a8063f3..6113b4784c2a 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -1,3 +1,9 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/perception_lm/modular_perception_lm.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_perception_lm.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2025 the HuggingFace Inc. team. All rights reserved. # @@ -12,41 +18,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch PerceptionLM model.""" +import math from dataclasses import dataclass from typing import List, Optional, Tuple, Union -import math - import torch -import torch.utils.checkpoint -from torch import nn import torch.nn.functional as F +from timm.models.pe import PE +from torch import nn -from ...activations import ACT2FN -from ...generation import GenerationMixin +from transformers.generation.utils import GenerationMixin + +# from ...generation import GenerationMixin from ...modeling_outputs import ModelOutput from ...modeling_utils import PreTrainedModel -from ...utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_torchdynamo_compiling, - logging, - replace_return_docstrings, -) -from timm.models.pe import PE -from ..auto import AutoModel, AutoModelForCausalLM -from .configuration_perception_lm import PerceptionLMConfig, PerceptionEncoderConfig +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from ..auto import AutoModelForCausalLM +from .configuration_perception_lm import PerceptionEncoderConfig, PerceptionLMConfig -logger = logging.get_logger(__name__) - _CONFIG_FOR_DOC = "PerceptionLMConfig" -# Base docstring -_CHECKPOINT_FOR_DOC = "facebook/Perception-LM-1B" - class PerceptionEncoder(PE): def __init__(self, config: PerceptionEncoderConfig): @@ -119,29 +112,6 @@ class PerceptionLMCausalLMOutputWithPast(ModelOutput): image_hidden_states: Optional[torch.FloatTensor] = None -# # Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->PerceptionLM -# class PerceptionLMMultiModalProjector(nn.Module): -# def __init__(self, config: PerceptionLMConfig): -# super().__init__() -# # We have hidden_size * the number of vision feature layers -# num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) -# self.linear_1 = nn.Linear( -# config.vision_config.hidden_size * num_feature_layers, -# config.text_config.hidden_size, -# 
bias=config.multimodal_projector_bias, -# ) -# self.act = ACT2FN[config.projector_hidden_act] -# self.linear_2 = nn.Linear( -# config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias -# ) - -# def forward(self, image_features): -# hidden_states = self.linear_1(image_features) -# hidden_states = self.act(hidden_states) -# hidden_states = self.linear_2(hidden_states) -# return hidden_states - - class AdaptiveAvgPooling(nn.Module): def __init__(self, pooling_ratio=2): super(AdaptiveAvgPooling, self).__init__() @@ -179,9 +149,7 @@ def __init__(self, config: PerceptionLMConfig): ), ) self.pooling = ( - AdaptiveAvgPooling(config.projector_pooling_ratio) - if config.projector_pooling_ratio > 1 - else nn.Identity() + AdaptiveAvgPooling(config.projector_pooling_ratio) if config.projector_pooling_ratio > 1 else nn.Identity() ) def forward(self, x): @@ -213,7 +181,6 @@ def forward(self, x): "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", PERCEPTION_LM_START_DOCSTRING, ) -# Copied from transformers.models.llava.modeling_llava.LlavaPreTrainedModel with Llava->PerceptionLM,llava->perception_lm class PerceptionLMPreTrainedModel(PreTrainedModel): config_class = PerceptionLMConfig base_model_prefix = "model" @@ -323,27 +290,14 @@ def _init_weights(self, module): """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", PERCEPTION_LM_START_DOCSTRING, ) -# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration with LLAVA->PERCEPTION_LM,Llava->PerceptionLM,llava-hf/llava-1.5-7b-hf->facebook/Perception-LM-1B -class PerceptionLMForConditionalGeneration( - PerceptionLMPreTrainedModel, GenerationMixin -): +class PerceptionLMModel(PerceptionLMPreTrainedModel): + _checkpoint_conversion_mapping = {"language_model.model": "language_model"} + def __init__(self, config: PerceptionLMConfig): super().__init__(config) - self.vision_model = PerceptionEncoder(config.vision_config) - + self.vision_tower = PerceptionEncoder(config.vision_config) self.multi_modal_projector = PerceptionLMMultiModalProjector(config) - self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) - - if self.language_model._tied_weights_keys is not None: - self._tied_weights_keys = [ - f"language_model.{k}" for k in self.language_model._tied_weights_keys - ] - - self.pad_token_id = ( - self.config.pad_token_id if self.config.pad_token_id is not None else -1 - ) - self.post_init() def get_input_embeddings(self): @@ -352,18 +306,6 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.language_model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.language_model.get_output_embeddings() - - def set_output_embeddings(self, new_embeddings): - self.language_model.set_output_embeddings(new_embeddings) - - def set_decoder(self, decoder): - self.language_model.set_decoder(decoder) - - def get_decoder(self): - return self.language_model.get_decoder() - def get_image_features( self, pixel_values: torch.FloatTensor, @@ -379,15 +321,13 @@ def get_image_features( image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`). 
""" print("pixel_values shape: ", pixel_values.shape) - image_outputs = self.vision_model(pixel_values[0]) + image_outputs = self.vision_tower(pixel_values[0]) print("image_outputs shape: ", image_outputs.shape) image_features = self.multi_modal_projector(image_outputs) return image_features @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC - ) + @replace_return_docstrings(output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -447,23 +387,13 @@ def forward( "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You must specify exactly one of input_ids or inputs_embeds" - ) + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if pixel_values_videos is not None: pixel_values = pixel_values_videos @@ -475,21 +405,16 @@ def forward( if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) + image_features = None if pixel_values is not None: image_features = self.get_image_features( pixel_values=pixel_values.to(inputs_embeds), ) special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to( - inputs_embeds.device - ) - image_features = image_features.to( - inputs_embeds.device, inputs_embeds.dtype - ) - inputs_embeds = inputs_embeds.masked_scatter( - special_image_mask, image_features - ) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) outputs = self.language_model( attention_mask=attention_mask, @@ -504,46 +429,34 @@ def forward( logits_to_keep=logits_to_keep, **lm_kwargs, ) + return outputs, image_features - logits = outputs[0] + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() - loss = None - if labels is not None: - # Shift so that tokens < n predict n - if attention_mask is not None: - # we use the input attention mask to shift the logits and labels, because it is 2D. 
- # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft - shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to( - logits.device - ) - shift_logits = logits[..., :-1, :][ - shift_attention_mask.to(logits.device) != 0 - ].contiguous() - shift_labels = labels[..., 1:][ - shift_attention_mask.to(labels.device) != 0 - ].contiguous() - else: - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1).to(shift_logits.device), - ) + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) - return PerceptionLMCausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - image_hidden_states=image_features if pixel_values is not None else None, - ) + def get_decoder(self): + return self.language_model.get_decoder() + + +@add_start_docstrings( + """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", + PERCEPTION_LM_START_DOCSTRING, +) +class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): + def __init__(self, config: PerceptionLMConfig, **super_kwargs): + super().__init__(config, **super_kwargs) + self.model = PerceptionLMModel(config) + if self.model.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [ + f"model.language_model.{k}" for k in self.model.language_model._tied_weights_keys + ] + self.post_init() def prepare_inputs_for_generation( self, @@ -559,7 +472,7 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - model_inputs = self.language_model.prepare_inputs_for_generation( + model_inputs = self.model.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, @@ -576,5 +489,112 @@ def prepare_inputs_for_generation( model_inputs["pixel_values_videos"] = pixel_values_videos return model_inputs + @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **lm_kwargs, + ) -> Union[Tuple, 
PerceptionLMCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + Returns: + Example: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration + >>> model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B") + >>> processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B") + >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_new_tokens=15) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" + ```""" + + outputs, image_features = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + image_sizes=image_sizes, + **lm_kwargs, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. 
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) + shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() + else: + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1).to(shift_logits.device), + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return PerceptionLMCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + __all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel"] diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py new file mode 100644 index 000000000000..3a2339d967b7 --- /dev/null +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -0,0 +1,639 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
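
For reference, the label-shifting cross-entropy computed earlier in this patch (in `PerceptionLMForConditionalGeneration.forward`) can be exercised on its own. The following is a minimal standalone sketch with toy shapes and a made-up padding pattern; the tensor sizes, values, and mask layout are illustrative assumptions, not taken from the model or its checkpoints:

```python
import torch
from torch import nn

# Toy shapes for illustration only.
batch, seq_len, vocab = 2, 5, 11
logits = torch.randn(batch, seq_len, vocab)
labels = torch.randint(0, vocab, (batch, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 1, 1],
                               [0, 0, 1, 1, 1]])  # second sequence is left-padded

# Tokens < n predict token n: drop the last logit and the first label,
# keeping only positions whose (cropped) attention mask is non-zero.
shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1):]
shift_logits = logits[..., :-1, :][shift_attention_mask != 0]
shift_labels = labels[..., 1:][shift_attention_mask != 0]

loss = nn.CrossEntropyLoss()(
    shift_logits.view(-1, shift_logits.size(-1)),
    shift_labels.view(-1),
)
print(loss)  # scalar loss over the 7 unmasked next-token predictions
```

Cropping the attention mask to the logits' length mirrors the PrefixTuning case noted in the comment above, and the boolean indexing keeps padded positions out of the loss.
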
+"""PyTorch PerceptionLM model.""" + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import math + +import torch +import torch.utils.checkpoint +from torch import nn +import torch.nn.functional as F + +from transformers.generation.utils import GenerationMixin +from ..llava.modeling_llava import LLaVaModel, LLaVaForConditionalGeneration +from transformers.configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING, AutoConfig + + +from ...activations import ACT2FN +# from ...generation import GenerationMixin +from ...modeling_outputs import ModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from timm.models.pe import PE +from ..auto import AutoModelForCausalLM + +from .configuration_perception_lm import PerceptionLMConfig, PerceptionEncoderConfig + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "PerceptionLMConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/Perception-LM-1B" + + +class PerceptionEncoder(PE): + def __init__(self, config: PerceptionEncoderConfig): + assert config.pool_type == "none" + self.use_cls_token = config.use_cls_token + # Converting configs to timm PE args + super().__init__( + img_size=config.image_size, + patch_size=config.patch_size, + width=config.width, + layers=config.layers, + heads=config.heads, + mlp_ratio=config.mlp_ratio, + use_cls_token=config.use_cls_token, + use_abs_posemb=config.use_abs_posemb, + use_ln_post=config.use_ln_post, + ls_init_value=config.ls_init_value, + drop_path=config.drop_path, + output_dim=config.width, + use_attn_pool=False, + use_proj=False, + ) + + def forward(self, x): + x = super().forward(x) + if self.use_cls_token: + return x[:, 1:, :] + else: + return x + + +@dataclass +# Copied from transformers.models.llava.modeling_llava.LlavaCausalLMOutputWithPast with Llava->PerceptionLM +class PerceptionLMCausalLMOutputWithPast(ModelOutput): + """ + Base class for PerceptionLM causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + + +class AdaptiveAvgPooling(nn.Module): + def __init__(self, pooling_ratio=2): + super(AdaptiveAvgPooling, self).__init__() + self.pooling_ratio = pooling_ratio + + def forward(self, x): + b, num_tokens, c = x.shape + h = int(math.sqrt(num_tokens)) + assert h * h == num_tokens + + shape = (h // self.pooling_ratio, h // self.pooling_ratio) + x = x.permute(0, 2, 1).reshape(b, -1, h, h) + x = F.adaptive_avg_pool2d(x, shape) + x = x.flatten(2).transpose(1, 2) + + return x + + +class PerceptionLMMultiModalProjector(nn.Module): + def __init__(self, config: PerceptionLMConfig): + super().__init__() + input_size = config.vision_config.width + output_size = config.text_config.hidden_size + self.projector = nn.Sequential( + nn.Linear( + in_features=input_size, + out_features=output_size, + bias=True, + ), + nn.GELU(), + nn.Linear( + in_features=output_size, + out_features=output_size, + bias=True, + ), + ) + self.pooling = ( + AdaptiveAvgPooling(config.projector_pooling_ratio) + if config.projector_pooling_ratio > 1 + else nn.Identity() + ) + + def forward(self, x): + x = x.permute(1, 0, 2) # NLD -> LND + x = self.projector(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.pooling(x) + return x + + +PERCEPTION_LM_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`PerceptionLMConfig`] or [`PerceptionLMVisionConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + PERCEPTION_LM_START_DOCSTRING, +) +class PerceptionLMPreTrainedModel(PreTrainedModel): + config_class = PerceptionLMConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["PerceptionLMVisionAttention"] + _skip_keys_device_placement = "past_key_values" + _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_quantized_cache = True + _supports_static_cache = True + + def _init_weights(self, module): + # important: this ported version of PerceptionLM isn't meant for training from scratch - only + # inference and fine-tuning - so the proper init weights code has been removed - the original codebase + # https://github.com/haotian-liu/LLaVA/tree/main/perception_lm should serve for that purpose + std = getattr( + self.config, + "initializer_range", + self.config.get_text_config().initializer_range, + ) + + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + + +PERCEPTION_LM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`PerceptionLMProcessor`] uses + [`CLIPImageProcessor`] for processing images). + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. 
[What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + vision_feature_layer (`Union[int, List[int]], *optional*, defaults to -2`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
+""" +@add_start_docstrings( + """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", + PERCEPTION_LM_START_DOCSTRING, +) +class PerceptionLMModel(LLaVaModel): + def __init__(self, config: PerceptionLMConfig): + super().__init__(config) + self.vision_tower = PerceptionEncoder(config.vision_config) + self.multi_modal_projector = PerceptionLMMultiModalProjector(config) + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + **kwargs, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_tiles, channels, height, width)`) + The tensors corresponding to the input images. + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`). + """ + print("pixel_values shape: ", pixel_values.shape) + image_outputs = self.vision_tower(pixel_values[0]) + print("image_outputs shape: ", image_outputs.shape) + image_features = self.multi_modal_projector(image_outputs) + return image_features + + + @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **lm_kwargs, + ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). 
Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + + Returns: + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration + + >>> model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B") + >>> processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B") + + >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_new_tokens=15) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" + ```""" + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You must specify exactly one of input_ids or inputs_embeds" + ) + if pixel_values_videos is not None: + pixel_values = pixel_values_videos + + if pixel_values is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + image_features = None + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values.to(inputs_embeds), + ) + + special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to( + inputs_embeds.device + ) + image_features = image_features.to( + inputs_embeds.device, inputs_embeds.dtype + ) + inputs_embeds = inputs_embeds.masked_scatter( + special_image_mask, image_features + ) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **lm_kwargs, + ) + return outputs, image_features + + +@add_start_docstrings( + """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", + PERCEPTION_LM_START_DOCSTRING, +) +class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): + + def __init__(self, config: PerceptionLMConfig, **super_kwargs): + super().__init__(config, **super_kwargs) + self.model = PerceptionLMModel(config) + if 
self.model.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [ + f"model.language_model.{k}" for k in self.model.language_model._tied_weights_keys + ] + self.post_init() + + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_values_videos=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = self.model.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["pixel_values_videos"] = pixel_values_videos + return model_inputs + + @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **lm_kwargs, + ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). 
+ Returns: + Example: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration + >>> model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B") + >>> processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B") + >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_new_tokens=15) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" + ```""" + + outputs, image_features = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + image_sizes=image_sizes, + **lm_kwargs, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. 
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to( + logits.device + ) + shift_logits = logits[..., :-1, :][ + shift_attention_mask.to(logits.device) != 0 + ].contiguous() + shift_labels = labels[..., 1:][ + shift_attention_mask.to(labels.device) != 0 + ].contiguous() + else: + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1).to(shift_logits.device), + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return PerceptionLMCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel"] From 65e52313321094d014588b1bd76de6f881cdc52a Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 14 May 2025 22:55:02 +0000 Subject: [PATCH 13/65] fix tie_word_embedding --- .../perception_lm/modeling_perception_lm.py | 21 +++---------------- .../perception_lm/modular_perception_lm.py | 20 ++---------------- 2 files changed, 5 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 6113b4784c2a..08845cbf93ad 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -194,9 +194,6 @@ class PerceptionLMPreTrainedModel(PreTrainedModel): _supports_static_cache = True def _init_weights(self, module): - # important: this ported version of PerceptionLM isn't meant for training from scratch - only - # inference and fine-tuning - so the proper init weights code has been removed - the original codebase - # https://github.com/haotian-liu/LLaVA/tree/main/perception_lm should serve for that purpose std = getattr( self.config, "initializer_range", @@ -337,16 +334,12 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[Union[int, List[int]]] = None, - vision_feature_select_strategy: Optional[str] = None, - labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, - image_sizes: Optional[torch.Tensor] = None, **lm_kwargs, ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: r""" @@ -452,12 +445,11 @@ class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, Generati def __init__(self, config: PerceptionLMConfig, **super_kwargs): super().__init__(config, **super_kwargs) self.model = PerceptionLMModel(config) - if self.model.language_model._tied_weights_keys is not None: - self._tied_weights_keys = [ - f"model.language_model.{k}" for k in self.model.language_model._tied_weights_keys - ] self.post_init() + def get_output_embeddings(self): + return 
self.model.get_output_embeddings() + def prepare_inputs_for_generation( self, input_ids, @@ -500,8 +492,6 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[Union[int, List[int]]] = None, - vision_feature_select_strategy: Optional[str] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -509,7 +499,6 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, - image_sizes: Optional[torch.Tensor] = None, **lm_kwargs, ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: r""" @@ -549,16 +538,12 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, - vision_feature_layer=vision_feature_layer, - vision_feature_select_strategy=vision_feature_select_strategy, - labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, cache_position=cache_position, logits_to_keep=logits_to_keep, - image_sizes=image_sizes, **lm_kwargs, ) diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 3a2339d967b7..4ce365e6197e 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -209,9 +209,6 @@ class PerceptionLMPreTrainedModel(PreTrainedModel): _supports_static_cache = True def _init_weights(self, module): - # important: this ported version of PerceptionLM isn't meant for training from scratch - only - # inference and fine-tuning - so the proper init weights code has been removed - the original codebase - # https://github.com/haotian-liu/LLaVA/tree/main/perception_lm should serve for that purpose std = getattr( self.config, "initializer_range", @@ -362,16 +359,12 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[Union[int, List[int]]] = None, - vision_feature_select_strategy: Optional[str] = None, - labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, - image_sizes: Optional[torch.Tensor] = None, **lm_kwargs, ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: r""" @@ -482,12 +475,10 @@ class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, Generati def __init__(self, config: PerceptionLMConfig, **super_kwargs): super().__init__(config, **super_kwargs) self.model = PerceptionLMModel(config) - if self.model.language_model._tied_weights_keys is not None: - self._tied_weights_keys = [ - f"model.language_model.{k}" for k in self.model.language_model._tied_weights_keys - ] self.post_init() + def get_output_embeddings(self): + return self.model.get_output_embeddings() def prepare_inputs_for_generation( self, @@ -533,8 +524,6 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: 
Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[Union[int, List[int]]] = None, - vision_feature_select_strategy: Optional[str] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -542,7 +531,6 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, - image_sizes: Optional[torch.Tensor] = None, **lm_kwargs, ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: r""" @@ -582,16 +570,12 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, - vision_feature_layer=vision_feature_layer, - vision_feature_select_strategy=vision_feature_select_strategy, - labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, cache_position=cache_position, logits_to_keep=logits_to_keep, - image_sizes=image_sizes, **lm_kwargs, ) From cdbeeeb4d09b49cce69230872ff68117018330af Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Mon, 19 May 2025 21:19:22 +0000 Subject: [PATCH 14/65] Testing with timm PE --- .../convert_perception_lm_weights_to_hf.py | 38 +++++++++-- .../perception_lm/modeling_perception_lm.py | 47 +++++++------ .../perception_lm/modular_perception_lm.py | 68 ++++++++++++------- test.py | 4 +- 4 files changed, 103 insertions(+), 54 deletions(-) diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 39705c41e4c5..dd6ba3c0cfd3 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -40,11 +40,13 @@ PerceptionLMVideoProcessor, ) from transformers.models.perception_lm.modeling_perception_lm import ( + PerceptionEncoder, PerceptionLMForConditionalGeneration, ) from transformers.models.perception_lm.processing_perception_lm import ( PerceptionLMProcessor, ) +from timm.models.eva import checkpoint_filter_fn try: @@ -155,6 +157,15 @@ def write_json(text, path): json.dump(text, f) +def write_weights(state_dict, index_dict, param_count, filename): + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, filename) + print(f"Saved {filename}") + return param_count + + def write_model( model_path, input_base_path, @@ -227,7 +238,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): param_count = 0 index_dict = {"weight_map": {}} for layer_i in range(n_layers): - filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" + filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 2}.bin" assert num_shards == 1, "PerceptionLM does not support sharded weights" state_dict = { f"language_model.model.layers.{layer_i}.self_attn.q_proj.weight": permute( @@ -269,7 +280,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): torch.save(state_dict, os.path.join(tmp_model_path, filename)) print(f"Saved {filename}") - filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" + filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 2}.bin" state_dict = { "language_model.model.embed_tokens.weight": loaded["tok_embeddings.weight"], @@ -292,14 +303,30 @@ def permute(w, n_heads, dim1=dim, dim2=dim): "vision_projector.projector.2.bias" ], } - for k, v in loaded.items(): - if "vision_model" in k: - 
state_dict[k.replace("vision_model", "vision_tower")] = v for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() torch.save(state_dict, os.path.join(tmp_model_path, filename)) print(f"Saved {filename}") + + filename = f"pytorch_model-{n_layers + 2}-of-{n_layers + 2}.bin" + state_dict = { + k.replace("vision_model.", ""): v + for k, v in loaded.items() + if "vision_model" in k + } + vision_config = PerceptionEncoderConfig(**model_params["vision_model"]) + perception_encoder = PerceptionEncoder(vision_config) + state_dict = checkpoint_filter_fn( + state_dict, perception_encoder.eva_pe + ) + state_dict = { "vision_tower.eva_pe." + k: v for k, v in state_dict.items()} + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(tmp_model_path, filename)) + print(f"Saved {filename}") + # Write configs index_dict["metadata"] = {"total_size": param_count * 2} write_json( @@ -350,7 +377,6 @@ def permute(w, n_heads, dim1=dim, dim2=dim): tie_word_embeddings=tie_word_embeddings, ) - vision_config = PerceptionEncoderConfig(**model_params["vision_model"]) config = PerceptionLMConfig( text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 08845cbf93ad..17a038edd551 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -25,7 +25,7 @@ import torch import torch.nn.functional as F -from timm.models.pe import PE +from timm.models.eva import vit_pe_core_gigantic_patch14_448, vit_pe_core_large_patch14_336 from torch import nn from transformers.generation.utils import GenerationMixin @@ -41,35 +41,40 @@ _CONFIG_FOR_DOC = "PerceptionLMConfig" -class PerceptionEncoder(PE): +class PerceptionEncoder(nn.Module): def __init__(self, config: PerceptionEncoderConfig): + super().__init__() assert config.pool_type == "none" self.use_cls_token = config.use_cls_token - # Converting configs to timm PE args - super().__init__( - img_size=config.image_size, - patch_size=config.patch_size, - width=config.width, - layers=config.layers, - heads=config.heads, - mlp_ratio=config.mlp_ratio, - use_cls_token=config.use_cls_token, - use_abs_posemb=config.use_abs_posemb, - use_ln_post=config.use_ln_post, - ls_init_value=config.ls_init_value, - drop_path=config.drop_path, - output_dim=config.width, - use_attn_pool=False, - use_proj=False, - ) + kwargs = { + "img_size": (config.image_size, config.image_size), + "depth": config.layers, + "num_classes": 0, + "global_pool": "", + "use_post_transformer_norm": config.use_ln_post, + } + if config.layers == 23 and config.width == 1024: + self.eva_pe = vit_pe_core_large_patch14_336( + **kwargs, + ) + elif config.layers == 47 and config.width == 1536: + self.eva_pe = vit_pe_core_gigantic_patch14_448( + **kwargs, + ) + else: + raise ValueError(f"Unsupported PE config: {config.layers} layers and {config.width} width") + self.eva_pe._initialize_weights = lambda x: x # disable weight initialization def forward(self, x): - x = super().forward(x) + x = self.eva_pe(x) if self.use_cls_token: return x[:, 1:, :] else: return x + def _initialize_weights(self): + pass + @dataclass # Copied from transformers.models.llava.modeling_llava.LlavaCausalLMOutputWithPast with Llava->PerceptionLM @@ -292,9 +297,9 @@ class 
PerceptionLMModel(PerceptionLMPreTrainedModel): def __init__(self, config: PerceptionLMConfig): super().__init__(config) - self.vision_tower = PerceptionEncoder(config.vision_config) self.multi_modal_projector = PerceptionLMMultiModalProjector(config) self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.vision_tower = PerceptionEncoder(config.vision_config) self.post_init() def get_input_embeddings(self): diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 4ce365e6197e..4d6805321c58 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -31,6 +31,7 @@ from ...activations import ACT2FN + # from ...generation import GenerationMixin from ...modeling_outputs import ModelOutput from ...modeling_utils import PreTrainedModel @@ -40,10 +41,14 @@ logging, replace_return_docstrings, ) -from timm.models.pe import PE +from timm.models.eva import ( + vit_pe_core_large_patch14_336, + vit_pe_core_gigantic_patch14_448, +) + from ..auto import AutoModelForCausalLM -from .configuration_perception_lm import PerceptionLMConfig, PerceptionEncoderConfig +from .configuration_perception_lm import PerceptionEncoderConfig, PerceptionLMConfig logger = logging.get_logger(__name__) @@ -53,34 +58,44 @@ _CHECKPOINT_FOR_DOC = "facebook/Perception-LM-1B" -class PerceptionEncoder(PE): +class PerceptionEncoder(nn.Module): def __init__(self, config: PerceptionEncoderConfig): + super().__init__() assert config.pool_type == "none" self.use_cls_token = config.use_cls_token - # Converting configs to timm PE args - super().__init__( - img_size=config.image_size, - patch_size=config.patch_size, - width=config.width, - layers=config.layers, - heads=config.heads, - mlp_ratio=config.mlp_ratio, - use_cls_token=config.use_cls_token, - use_abs_posemb=config.use_abs_posemb, - use_ln_post=config.use_ln_post, - ls_init_value=config.ls_init_value, - drop_path=config.drop_path, - output_dim=config.width, - use_attn_pool=False, - use_proj=False, - ) + kwargs = { + "img_size": (config.image_size, config.image_size), + "depth": config.layers, + "num_classes": 0, + "global_pool": "", + "use_post_transformer_norm": config.use_ln_post, + "init_values": config.ls_init_value, + } + if config.layers == 23 and config.width == 1024: + self.eva_pe = vit_pe_core_large_patch14_336( + **kwargs, + ) + elif config.layers == 47 and config.width == 1536: + self.eva_pe = vit_pe_core_gigantic_patch14_448( + **kwargs, + ) + else: + raise ValueError( + f"Unsupported PE config: {config.layers} layers and {config.width} width" + ) + self.eva_pe._initialize_weights = ( + lambda x: x + ) # disable weight initialization def forward(self, x): - x = super().forward(x) + x = self.eva_pe(x) if self.use_cls_token: return x[:, 1:, :] else: return x + + def _initialize_weights(self): + pass @dataclass @@ -124,7 +139,6 @@ class PerceptionLMCausalLMOutputWithPast(ModelOutput): image_hidden_states: Optional[torch.FloatTensor] = None - class AdaptiveAvgPooling(nn.Module): def __init__(self, pooling_ratio=2): super(AdaptiveAvgPooling, self).__init__() @@ -296,6 +310,8 @@ def _init_weights(self, module): this tensor is not affected by padding. It is used to update the cache in the correct position and to infer the complete sequence length. 
""" + + @add_start_docstrings( """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", PERCEPTION_LM_START_DOCSTRING, @@ -303,6 +319,7 @@ def _init_weights(self, module): class PerceptionLMModel(LLaVaModel): def __init__(self, config: PerceptionLMConfig): super().__init__(config) + del self.vision_tower self.vision_tower = PerceptionEncoder(config.vision_config) self.multi_modal_projector = PerceptionLMMultiModalProjector(config) self.language_model = AutoModelForCausalLM.from_config(config.text_config) @@ -345,7 +362,6 @@ def get_image_features( image_features = self.multi_modal_projector(image_outputs) return image_features - @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) @replace_return_docstrings( output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC @@ -470,7 +486,9 @@ def forward( """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", PERCEPTION_LM_START_DOCSTRING, ) -class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): +class PerceptionLMForConditionalGeneration( + PerceptionLMPreTrainedModel, GenerationMixin +): def __init__(self, config: PerceptionLMConfig, **super_kwargs): super().__init__(config, **super_kwargs) @@ -510,7 +528,7 @@ def prepare_inputs_for_generation( model_inputs["pixel_values"] = pixel_values model_inputs["pixel_values_videos"] = pixel_values_videos return model_inputs - + @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) @replace_return_docstrings( output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC diff --git a/test.py b/test.py index 3b5b54a4de85..acb370951d6a 100644 --- a/test.py +++ b/test.py @@ -3,10 +3,10 @@ from transformers import AutoProcessor from transformers import PerceptionLMForConditionalGeneration -processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_3b", use_fast=True) +processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b", use_fast=True) print(type(processor)) -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_3b").to(torch.bfloat16).to("cuda") +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b").to(torch.bfloat16).to("cuda") conversation = [ { "role": "user", From 303ddff314e7681432ded70f9515df07fc578b2e Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Mon, 19 May 2025 22:11:55 +0000 Subject: [PATCH 15/65] check in missed converstion from modular to model.py --- src/transformers/models/perception_lm/modeling_perception_lm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 17a038edd551..f4bbede3ab8d 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -52,6 +52,7 @@ def __init__(self, config: PerceptionEncoderConfig): "num_classes": 0, "global_pool": "", "use_post_transformer_norm": config.use_ln_post, + "init_values": config.ls_init_value, } if config.layers == 23 and config.width == 1024: self.eva_pe = vit_pe_core_large_patch14_336( From 742c8e121a317852a36a936fd2dfc36518e1b8a8 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Mon, 19 May 2025 22:51:52 +0000 Subject: [PATCH 16/65] First working version of PLM with Eva PE. 
PLM-1B and 3B outputs are exactly the same as before. PLM-8B output has some differences. --- .../models/perception_lm/modeling_perception_lm.py | 4 ++++ .../models/perception_lm/modular_perception_lm.py | 10 ++++++---- test_video.py | 4 ++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index f4bbede3ab8d..5c17daa07ed5 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -53,6 +53,10 @@ def __init__(self, config: PerceptionEncoderConfig): "global_pool": "", "use_post_transformer_norm": config.use_ln_post, "init_values": config.ls_init_value, + "ref_feat_shape": ( + config.image_size // config.patch_size, + config.image_size // config.patch_size, + ), } if config.layers == 23 and config.width == 1024: self.eva_pe = vit_pe_core_large_patch14_336( diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 4d6805321c58..2e4cc74e8850 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -70,6 +70,10 @@ def __init__(self, config: PerceptionEncoderConfig): "global_pool": "", "use_post_transformer_norm": config.use_ln_post, "init_values": config.ls_init_value, + "ref_feat_shape": ( + config.image_size // config.patch_size, + config.image_size // config.patch_size, + ), } if config.layers == 23 and config.width == 1024: self.eva_pe = vit_pe_core_large_patch14_336( @@ -83,9 +87,7 @@ def __init__(self, config: PerceptionEncoderConfig): raise ValueError( f"Unsupported PE config: {config.layers} layers and {config.width} width" ) - self.eva_pe._initialize_weights = ( - lambda x: x - ) # disable weight initialization + self.eva_pe._initialize_weights = lambda x: x # disable weight initialization def forward(self, x): x = self.eva_pe(x) @@ -93,7 +95,7 @@ def forward(self, x): return x[:, 1:, :] else: return x - + def _initialize_weights(self): pass diff --git a/test_video.py b/test_video.py index b02b71ab25bf..bc3bc04e895f 100644 --- a/test_video.py +++ b/test_video.py @@ -3,10 +3,10 @@ from transformers import AutoProcessor from transformers import PerceptionLMForConditionalGeneration -processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_3b") +processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b") print(type(processor)) -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_3b").to(torch.bfloat16).to("cuda") +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b").to(torch.bfloat16).to("cuda") conversation = [ { "role": "user", From 63be4c6303bf0fe145795261236960a34c690aad Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 20 May 2025 23:55:35 +0000 Subject: [PATCH 17/65] address review comments --- .../convert_perception_lm_weights_to_hf.py | 2 +- .../models/perception_lm/image_transform.py | 1 + .../perception_lm/modeling_perception_lm.py | 253 +++++------------- .../perception_lm/modular_perception_lm.py | 249 +++-------------- test.py | 4 +- 5 files changed, 105 insertions(+), 404 deletions(-) diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py 
b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index dd6ba3c0cfd3..804df91fe289 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -405,7 +405,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): # Avoid saving this as part of the config. del model.config._name_or_path - model.config.torch_dtype = torch.float16 + model.config.torch_dtype = torch.bfloat16 print("Saving in the Transformers format.") if push_to_hub: diff --git a/src/transformers/models/perception_lm/image_transform.py b/src/transformers/models/perception_lm/image_transform.py index 17791ef1e5be..3ec7430cd907 100644 --- a/src/transformers/models/perception_lm/image_transform.py +++ b/src/transformers/models/perception_lm/image_transform.py @@ -426,6 +426,7 @@ def __call__( image = _resize( image, (ar[1] * self.size, ar[0] * self.size), # (h, w) + use_pil_resize=True, ) image = self._pad(image, ar[0] * self.size, ar[1] * self.size) diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 5c17daa07ed5..0210c806a19a 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -33,14 +33,11 @@ # from ...generation import GenerationMixin from ...modeling_outputs import ModelOutput from ...modeling_utils import PreTrainedModel -from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from ...utils import auto_docstring, can_return_tuple from ..auto import AutoModelForCausalLM from .configuration_perception_lm import PerceptionEncoderConfig, PerceptionLMConfig -_CONFIG_FOR_DOC = "PerceptionLMConfig" - - class PerceptionEncoder(nn.Module): def __init__(self, config: PerceptionEncoderConfig): super().__init__() @@ -81,47 +78,6 @@ def _initialize_weights(self): pass -@dataclass -# Copied from transformers.models.llava.modeling_llava.LlavaCausalLMOutputWithPast with Llava->PerceptionLM -class PerceptionLMCausalLMOutputWithPast(ModelOutput): - """ - Base class for PerceptionLM causal language model (or autoregressive) outputs. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss (for next-token prediction). - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) - - Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
- - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - image_hidden_states (`torch.FloatTensor`, *optional*): - A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`. - image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. - """ - - loss: Optional[torch.FloatTensor] = None - logits: Optional[torch.FloatTensor] = None - past_key_values: Optional[List[torch.FloatTensor]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - image_hidden_states: Optional[torch.FloatTensor] = None - - class AdaptiveAvgPooling(nn.Module): def __init__(self, pooling_ratio=2): super(AdaptiveAvgPooling, self).__init__() @@ -145,158 +101,103 @@ def __init__(self, config: PerceptionLMConfig): super().__init__() input_size = config.vision_config.width output_size = config.text_config.hidden_size - self.projector = nn.Sequential( - nn.Linear( - in_features=input_size, - out_features=output_size, - bias=True, - ), - nn.GELU(), - nn.Linear( - in_features=output_size, - out_features=output_size, - bias=True, - ), + self.projector = nn.ModuleList( + [ + nn.Linear( + in_features=input_size, + out_features=output_size, + bias=True, + ), + nn.GELU(), + nn.Linear( + in_features=output_size, + out_features=output_size, + bias=True, + ), + ] ) self.pooling = ( AdaptiveAvgPooling(config.projector_pooling_ratio) if config.projector_pooling_ratio > 1 else nn.Identity() ) - def forward(self, x): - x = x.permute(1, 0, 2) # NLD -> LND - x = self.projector(x) - x = x.permute(1, 0, 2) # LND -> NLD - x = self.pooling(x) - return x + def forward(self, features): + features = features.permute(1, 0, 2) # NLD -> LND + for layer in self.projector: + features = layer(features) + features = features.permute(1, 0, 2) # LND -> NLD + features = self.pooling(features) + return features -PERCEPTION_LM_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`PerceptionLMConfig`] or [`PerceptionLMVisionConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - PERCEPTION_LM_START_DOCSTRING, -) +@auto_docstring class PerceptionLMPreTrainedModel(PreTrainedModel): config_class = PerceptionLMConfig base_model_prefix = "model" supports_gradient_checkpointing = True - _no_split_modules = ["PerceptionLMVisionAttention"] _skip_keys_device_placement = "past_key_values" _supports_cache_class = True _supports_flash_attn_2 = True _supports_sdpa = True _supports_quantized_cache = True _supports_static_cache = True + _supports_attention_backend = True def _init_weights(self, module): - std = getattr( - self.config, - "initializer_range", - self.config.get_text_config().initializer_range, - ) + # important: this ported version of PerceptionLM isn't meant for training from scratch - only + # inference and fine-tuning - so the proper init weights code has been removed - the original codebase + # https://github.com/haotian-liu/PerceptionLM/tree/main/perception_lm should serve for that purpose + std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range) if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() -PERCEPTION_LM_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. +@dataclass +class PerceptionLMCausalLMOutputWithPast(ModelOutput): + """ + Base class for PerceptionLM causal language model (or autoregressive) outputs. - [What are input IDs?](../glossary#input-ids) - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): - The tensors corresponding to the input images. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`PerceptionLMProcessor`] uses - [`CLIPImageProcessor`] for processing images). - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - [What are attention masks?](../glossary#attention-mask) + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. 
This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - vision_feature_layer (`Union[int, List[int]], *optional*, defaults to -2`): - The index of the layer to select the vision feature. If multiple indices are provided, - the vision feature of the corresponding indices will be concatenated to form the - vision features. - vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): - The feature selection strategy used to select the vision feature from the vision backbone. - Can be one of `"default"` or `"full"`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, - this tensor is not affected by padding. It is used to update the cache in the correct position and to infer - the complete sequence length. -""" - - -@add_start_docstrings( - """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", - PERCEPTION_LM_START_DOCSTRING, -) +@auto_docstring class PerceptionLMModel(PerceptionLMPreTrainedModel): _checkpoint_conversion_mapping = {"language_model.model": "language_model"} @@ -327,14 +228,12 @@ def get_image_features( Returns: image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`). 
""" - print("pixel_values shape: ", pixel_values.shape) - image_outputs = self.vision_tower(pixel_values[0]) - print("image_outputs shape: ", image_outputs.shape) + image_outputs = self.vision_tower(pixel_values.flatten(0, 1)) image_features = self.multi_modal_projector(image_outputs) return image_features - @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + @can_return_tuple + @auto_docstring def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -437,20 +336,8 @@ def forward( def get_output_embeddings(self): return self.language_model.get_output_embeddings() - def set_output_embeddings(self, new_embeddings): - self.language_model.set_output_embeddings(new_embeddings) - - def set_decoder(self, decoder): - self.language_model.set_decoder(decoder) - - def get_decoder(self): - return self.language_model.get_decoder() - -@add_start_docstrings( - """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", - PERCEPTION_LM_START_DOCSTRING, -) +@auto_docstring class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): def __init__(self, config: PerceptionLMConfig, **super_kwargs): super().__init__(config, **super_kwargs) @@ -491,8 +378,6 @@ def prepare_inputs_for_generation( model_inputs["pixel_values_videos"] = pixel_values_videos return model_inputs - @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 2e4cc74e8850..084a9c818e65 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -25,7 +25,11 @@ import torch.nn.functional as F from transformers.generation.utils import GenerationMixin -from ..llava.modeling_llava import LLaVaModel, LLaVaForConditionalGeneration +from ..llava.modeling_llava import ( + LlavaModel, + LlavaPreTrainedModel, + LlavaCausalLMOutputWithPast as PerceptionLMCausalLMOutputWithPast, +) from transformers.configuration_utils import PretrainedConfig from ..auto import CONFIG_MAPPING, AutoConfig @@ -39,7 +43,7 @@ add_start_docstrings, add_start_docstrings_to_model_forward, logging, - replace_return_docstrings, + auto_docstring, ) from timm.models.eva import ( vit_pe_core_large_patch14_336, @@ -100,47 +104,6 @@ def _initialize_weights(self): pass -@dataclass -# Copied from transformers.models.llava.modeling_llava.LlavaCausalLMOutputWithPast with Llava->PerceptionLM -class PerceptionLMCausalLMOutputWithPast(ModelOutput): - """ - Base class for PerceptionLM causal language model (or autoregressive) outputs. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss (for next-token prediction). - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) - - Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - image_hidden_states (`torch.FloatTensor`, *optional*): - A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`. - image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. - """ - - loss: Optional[torch.FloatTensor] = None - logits: Optional[torch.FloatTensor] = None - past_key_values: Optional[List[torch.FloatTensor]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - image_hidden_states: Optional[torch.FloatTensor] = None - - class AdaptiveAvgPooling(nn.Module): def __init__(self, pooling_ratio=2): super(AdaptiveAvgPooling, self).__init__() @@ -164,18 +127,20 @@ def __init__(self, config: PerceptionLMConfig): super().__init__() input_size = config.vision_config.width output_size = config.text_config.hidden_size - self.projector = nn.Sequential( - nn.Linear( - in_features=input_size, - out_features=output_size, - bias=True, - ), - nn.GELU(), - nn.Linear( - in_features=output_size, - out_features=output_size, - bias=True, - ), + self.projector = nn.ModuleList( + [ + nn.Linear( + in_features=input_size, + out_features=output_size, + bias=True, + ), + nn.GELU(), + nn.Linear( + in_features=output_size, + out_features=output_size, + bias=True, + ), + ] ) self.pooling = ( AdaptiveAvgPooling(config.projector_pooling_ratio) @@ -183,142 +148,20 @@ def __init__(self, config: PerceptionLMConfig): else nn.Identity() ) - def forward(self, x): - x = x.permute(1, 0, 2) # NLD -> LND - x = self.projector(x) - x = x.permute(1, 0, 2) # LND -> NLD - x = self.pooling(x) - return x - - -PERCEPTION_LM_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. 
+ def forward(self, features): + features = features.permute(1, 0, 2) # NLD -> LND + for layer in self.projector: + features = layer(features) + features = features.permute(1, 0, 2) # LND -> NLD + features = self.pooling(features) + return features - Parameters: - config ([`PerceptionLMConfig`] or [`PerceptionLMVisionConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - PERCEPTION_LM_START_DOCSTRING, -) -class PerceptionLMPreTrainedModel(PreTrainedModel): - config_class = PerceptionLMConfig +class PerceptionLMPreTrainedModel(LlavaPreTrainedModel): base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["PerceptionLMVisionAttention"] - _skip_keys_device_placement = "past_key_values" - _supports_cache_class = True - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_quantized_cache = True - _supports_static_cache = True - - def _init_weights(self, module): - std = getattr( - self.config, - "initializer_range", - self.config.get_text_config().initializer_range, - ) - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - - -PERCEPTION_LM_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): - The tensors corresponding to the input images. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`PerceptionLMProcessor`] uses - [`CLIPImageProcessor`] for processing images). - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. 
[What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - vision_feature_layer (`Union[int, List[int]], *optional*, defaults to -2`): - The index of the layer to select the vision feature. If multiple indices are provided, - the vision feature of the corresponding indices will be concatenated to form the - vision features. - vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): - The feature selection strategy used to select the vision feature from the vision backbone. - Can be one of `"default"` or `"full"`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, - this tensor is not affected by padding. It is used to update the cache in the correct position and to infer - the complete sequence length. 
-""" - - -@add_start_docstrings( - """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", - PERCEPTION_LM_START_DOCSTRING, -) -class PerceptionLMModel(LLaVaModel): +@auto_docstring +class PerceptionLMModel(LlavaModel): def __init__(self, config: PerceptionLMConfig): super().__init__(config) del self.vision_tower @@ -326,24 +169,9 @@ def __init__(self, config: PerceptionLMConfig): self.multi_modal_projector = PerceptionLMMultiModalProjector(config) self.language_model = AutoModelForCausalLM.from_config(config.text_config) - def get_input_embeddings(self): - return self.language_model.get_input_embeddings() - - def set_input_embeddings(self, value): - self.language_model.set_input_embeddings(value) - def get_output_embeddings(self): return self.language_model.get_output_embeddings() - def set_output_embeddings(self, new_embeddings): - self.language_model.set_output_embeddings(new_embeddings) - - def set_decoder(self, decoder): - self.language_model.set_decoder(decoder) - - def get_decoder(self): - return self.language_model.get_decoder() - def get_image_features( self, pixel_values: torch.FloatTensor, @@ -358,16 +186,10 @@ def get_image_features( Returns: image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`). """ - print("pixel_values shape: ", pixel_values.shape) - image_outputs = self.vision_tower(pixel_values[0]) - print("image_outputs shape: ", image_outputs.shape) + image_outputs = self.vision_tower(pixel_values.flatten(0, 1)) image_features = self.multi_modal_projector(image_outputs) return image_features - @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC - ) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -484,10 +306,7 @@ def forward( return outputs, image_features -@add_start_docstrings( - """The PERCEPTION_LM model which consists of a vision backbone and a language model.""", - PERCEPTION_LM_START_DOCSTRING, -) +@auto_docstring class PerceptionLMForConditionalGeneration( PerceptionLMPreTrainedModel, GenerationMixin ): @@ -531,10 +350,6 @@ def prepare_inputs_for_generation( model_inputs["pixel_values_videos"] = pixel_values_videos return model_inputs - @add_start_docstrings_to_model_forward(PERCEPTION_LM_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=PerceptionLMCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC - ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/test.py b/test.py index acb370951d6a..e92d57a2b922 100644 --- a/test.py +++ b/test.py @@ -3,10 +3,10 @@ from transformers import AutoProcessor from transformers import PerceptionLMForConditionalGeneration -processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b", use_fast=True) +processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b", use_fast=True) print(type(processor)) -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b").to(torch.bfloat16).to("cuda") +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") conversation = [ { "role": "user", From 139f829897e35380b8add144d661d859070358b2 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Thu, 22 May 2025 02:47:01 +0000 Subject: [PATCH 18/65] Fixed batching if video and image examples 
mixed. --- .../configuration_perception_lm.py | 9 ++- .../convert_perception_lm_weights_to_hf.py | 71 ++++++++++--------- .../perception_lm/modeling_perception_lm.py | 15 ++-- .../perception_lm/modular_perception_lm.py | 22 ++++-- .../perception_lm/processing_perception_lm.py | 68 +++++++++--------- test.py | 21 +++++- test_video.py | 4 +- 7 files changed, 120 insertions(+), 90 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index f90db925a6e8..73b47f223559 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -88,9 +88,6 @@ class PerceptionLMConfig(PretrainedConfig): ```""" model_type = "perception_lm" - attribute_map = { - "image_token_id": "image_token_index", - } sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( @@ -98,10 +95,12 @@ def __init__( vision_config=None, text_config=None, projector_pooling_ratio=1, - image_token_index=128002, + image_token_id=128002, + video_token_id=128003, **kwargs, ): - self.image_token_index = image_token_index + self.image_token_id = image_token_id + self.video_token_id = video_token_id if isinstance(vision_config, dict): vision_config = PerceptionEncoderConfig(**vision_config) elif isinstance(vision_config, PerceptionEncoderConfig): diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 804df91fe289..db424ea8e22e 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -129,7 +129,7 @@ "<|begin_of_text|>", "<|end_of_text|>", "<|image|>", - "<|reserved_special_token_1|>", + "<|video|>", "<|reserved_special_token_2|>", "<|reserved_special_token_3|>", "<|start_header_id|>", @@ -140,6 +140,35 @@ + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)] } +CHAT_TEMPLATE = ( + "{{- bos_token }}" + "{%- if messages[0]['role'] == 'system' -%}" + " {%- set system_message = messages[0]['content']|trim %}\n" + " {%- set messages = messages[1:] %}\n" + "{%- else %}" + " {%- set system_message = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.' 
%}" + "{%- endif %}" + "{{- '<|start_header_id|>system<|end_header_id|>\\n\\n' }}" + "{{- system_message }}" + "{{- '<|eot_id|>' }}" + "{%- for message in messages %}" + "{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' }}" + "{%- for content in message['content'] | selectattr('type', 'equalto', 'image') %}" + "{{ '<|image|>' }}" + "{%- endfor %}" + "{%- for content in message['content'] | selectattr('type', 'equalto', 'video') %}" + "{{ '<|video|>' }}" + "{%- endfor %}" + "{%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}" + "{{- content['text'] | trim }}" + "{%- endfor %}" + "{{'<|eot_id|>' }}" + "{%- endfor %}" + "{%- if add_generation_prompt %}" + "{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}" + "{%- endif %}" +) + def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): return multiple_of * ( @@ -381,7 +410,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), projector_pooling_ratio=projector_pooling_ratio, - image_token_index=image_token_id, + image_token_id=image_token_id, ) config.save_pretrained(tmp_model_path) @@ -432,33 +461,6 @@ def __init__( super().__init__(vocab_file, additional_special_tokens=special_tokens, **kwargs) tokenizer = self.converted() - chat_template = ( - "{{- bos_token }}" - "{%- if messages[0]['role'] == 'system' -%}" - " {%- set system_message = messages[0]['content']|trim %}\n" - " {%- set messages = messages[1:] %}\n" - "{%- else %}" - " {%- set system_message = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.' %}" - "{%- endif %}" - "{{- '<|start_header_id|>system<|end_header_id|>\\n\\n' }}" - "{{- system_message }}" - "{{- '<|eot_id|>' }}" - "{%- for message in messages %}" - "{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' }}" - "{%- for content in message['content'] | selectattr('type', 'in', ['image', 'video']) %}" - "{{ '<|image|>' }}" - "{%- endfor %}" - "{%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}" - "{{- content['text'] | trim }}" - "{%- endfor %}" - "{{'<|eot_id|>' }}" - "{%- endfor %}" - "{%- if add_generation_prompt %}" - "{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}" - "{%- endif %}" - ) - additional_kwargs = {"chat_template": chat_template} - self.converted_tokenizer = PreTrainedTokenizerFast( tokenizer_object=tokenizer, bos_token="<|begin_of_text|>", @@ -466,8 +468,10 @@ def __init__( model_input_names=["input_ids", "attention_mask"], model_max_length=context_length, clean_up_tokenization_spaces=True, - **additional_kwargs, + extra_special_tokens={"image_token": "<|image|>", "video_token": "<|video|>", "pad_token": "<|end_of_text|>"}, ) + self.converted_tokenizer.image_token_id = self.converted_tokenizer.encode(self.converted_tokenizer.image_token, add_special_tokens=False)[0] + self.converted_tokenizer.video_token_id = self.converted_tokenizer.encode(self.converted_tokenizer.video_token, add_special_tokens=False)[0] self.update_post_processor(self.converted_tokenizer) # finer special_tokens_map.json self.converted_tokenizer._bos_token = BOS_ADDED_TOKEN @@ -509,13 +513,9 @@ def write_tokenizer( special_tokens, context_length, ).converted_tokenizer - tokenizer.image_token = "<|image|>" - tokenizer.image_token_id = tokenizer.encode("<|image|>", add_special_tokens=False)[ - 0 - 
] + tokenizer.image_token_id = tokenizer.encode(tokenizer.image_token, add_special_tokens=False)[0] processor_config = { - "image_token": "<|image|>", "pooling_ratio": params["model"]["pooling_ratio"], "patch_size": params["model"]["vision_model"]["patch_size"], "processor_class": "PerceptionLMProcessor", @@ -540,6 +540,7 @@ def write_tokenizer( image_processor=image_preprocessor, video_processor=video_preprocessor, tokenizer=tokenizer, + chat_template=CHAT_TEMPLATE, **processor_config, ) diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 0210c806a19a..f635f20d4afb 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -296,9 +296,6 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - if pixel_values_videos is not None: - pixel_values = pixel_values_videos - if pixel_values is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" @@ -312,12 +309,20 @@ def forward( image_features = self.get_image_features( pixel_values=pixel_values.to(inputs_embeds), ) - special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + image_features = image_features.to(inputs_embeds) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + if pixel_values_videos is not None: + video_features = self.get_image_features( + pixel_values=pixel_values_videos.to(inputs_embeds), + ) + special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1) + special_video_mask = special_video_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + video_features = video_features.to(inputs_embeds) + inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features) + outputs = self.language_model( attention_mask=attention_mask, position_ids=position_ids, diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 084a9c818e65..4959428eb272 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -160,6 +160,7 @@ def forward(self, features): class PerceptionLMPreTrainedModel(LlavaPreTrainedModel): base_model_prefix = "model" + @auto_docstring class PerceptionLMModel(LlavaModel): def __init__(self, config: PerceptionLMConfig): @@ -262,9 +263,6 @@ def forward( raise ValueError( "You must specify exactly one of input_ids or inputs_embeds" ) - if pixel_values_videos is not None: - pixel_values = pixel_values_videos - if pixel_values is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" @@ -278,18 +276,28 @@ def forward( image_features = self.get_image_features( pixel_values=pixel_values.to(inputs_embeds), ) - special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) special_image_mask = 
special_image_mask.expand_as(inputs_embeds).to( inputs_embeds.device ) - image_features = image_features.to( - inputs_embeds.device, inputs_embeds.dtype - ) + image_features = image_features.to(inputs_embeds) inputs_embeds = inputs_embeds.masked_scatter( special_image_mask, image_features ) + if pixel_values_videos is not None: + video_features = self.get_image_features( + pixel_values=pixel_values_videos.to(inputs_embeds), + ) + special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1) + special_video_mask = special_video_mask.expand_as(inputs_embeds).to( + inputs_embeds.device + ) + video_features = video_features.to(inputs_embeds) + inputs_embeds = inputs_embeds.masked_scatter( + special_video_mask, video_features + ) + outputs = self.language_model( attention_mask=attention_mask, position_ids=position_ids, diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 9c8c9ddb9130..b6aeb5a7cde5 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -16,7 +16,8 @@ Processor class for PerceptionLM. """ -from typing import List, Union +import torch +from typing import List, Union, Iterable from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, get_image_size, to_numpy_array @@ -92,13 +93,11 @@ def __init__( self.patch_size = patch_size self.num_additional_image_tokens = num_additional_image_tokens self.vision_feature_select_strategy = vision_feature_select_strategy - self.media_token = tokenizer.media_token if hasattr(tokenizer, "media_token") else media_token self.pooling_ratio = pooling_ratio - self.media_token_id = ( - tokenizer.media_token_id - if getattr(tokenizer, "media_token_id", None) - else tokenizer.convert_tokens_to_ids(self.media_token) - ) + self.image_token = tokenizer.image_token + self.video_token = tokenizer.video_token + self.image_token_id = tokenizer.image_token_id + self.video_token_id = tokenizer.video_token_id super().__init__(video_processor, image_processor, tokenizer, chat_template=chat_template) def __call__( @@ -114,7 +113,7 @@ def __call__( and `kwargs` arguments to PerceptionLMTokenizerFast's [`~PerceptionLMTokenizerFast.__call__`] if `text` is not `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring - of the above two methods for more information. + of the above two methods for more information. 
Args: images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): @@ -170,37 +169,40 @@ def __call__( # try to expand inputs in processing if we have the necessary parts prompt_strings = [] - pixel_values = None - pixel_values_videos = None - - if image_inputs.get("pixel_values") is not None: - pixel_values = image_inputs["pixel_values"] - if videos_inputs.get("pixel_values_videos") is not None: - pixel_values_videos = videos_inputs["pixel_values_videos"] - for i, sample in enumerate(text): - if pixel_values is not None: - media = pixel_values[i] - elif pixel_values_videos is not None: - media = pixel_values_videos[i] - else: - continue - + + pixel_values = iter(image_inputs.get("pixel_values", [])) + pixel_values_videos = iter(videos_inputs.get("pixel_values_videos", [])) + for sample in text: # Replace the media token with the expanded media token sequence - print("media.shape", media.shape) - height, width = get_image_size(to_numpy_array(media)) - num_tiles = media.shape[0] - num_media_tokens = (height // self.patch_size // self.pooling_ratio) * ( - width // self.patch_size // self.pooling_ratio - ) * num_tiles - print("num_media_tokens", num_media_tokens) - print("self.media_token", self.media_token) - sample = sample.replace(self.media_token, self.media_token * num_media_tokens) + sample = self._expand_media_tokens(sample, self.tokenizer.image_token, pixel_values) + sample = self._expand_media_tokens(sample, self.tokenizer.video_token, pixel_values_videos) prompt_strings.append(sample) return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) - self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["media"]) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image", "video"]) return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors) + + def _expand_media_tokens(self, sample, media_token: str, media_iter: Iterable): + media_count = sample.count(media_token) + if media_count > 0: + media_list = [next(media_iter) for _ in range(media_count)] + sample_splits = sample.split(media_token) + media_token_list = [] + for media in media_list: + height, width = get_image_size(to_numpy_array(media)) + num_tiles = media.shape[0] + num_media_tokens = (height // self.patch_size // self.pooling_ratio) * ( + width // self.patch_size // self.pooling_ratio + ) * num_tiles + print("num_media_tokens", num_media_tokens) + media_token_list.append(num_media_tokens) + sample = "" + for i, num_media_tokens in enumerate(media_token_list): + sample += sample_splits[i] + sample += media_token * num_media_tokens + sample += sample_splits[-1] + return sample def batch_decode(self, *args, **kwargs): """ diff --git a/test.py b/test.py index e92d57a2b922..74a8128c9c15 100644 --- a/test.py +++ b/test.py @@ -7,7 +7,7 @@ print(type(processor)) model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") -conversation = [ +conversation1 = [ { "role": "user", "content": [ @@ -19,14 +19,29 @@ ], } ] - +conversation2 = [ + { + "role": "user", + "content": [ + { + "type": "video", + "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4", + }, + {"type": "text", "text": "Can you describe the video in detail?"}, + ], + } +] # print(model.config) inputs = 
processor.apply_chat_template( - conversation, + [conversation1, conversation2], + num_frames=32, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", + video_load_backend="decord", + padding=True, + padding_side="left", ) inputs = inputs.to(model.device) # torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") diff --git a/test_video.py b/test_video.py index bc3bc04e895f..e25ca3790e88 100644 --- a/test_video.py +++ b/test_video.py @@ -3,10 +3,10 @@ from transformers import AutoProcessor from transformers import PerceptionLMForConditionalGeneration -processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b") +processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b") print(type(processor)) -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_8b").to(torch.bfloat16).to("cuda") +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") conversation = [ { "role": "user", From 0f8663a3b082b865994031b953d7cf7d197b44a5 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 23 May 2025 00:07:26 +0000 Subject: [PATCH 19/65] Simplify PE configuration. --- .../configuration_perception_lm.py | 26 +++++----- .../convert_perception_lm_weights_to_hf.py | 48 +++++++++++++++---- .../perception_lm/modeling_perception_lm.py | 35 +++++--------- .../perception_lm/modular_perception_lm.py | 41 +++++----------- 4 files changed, 75 insertions(+), 75 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 73b47f223559..098a2823cf20 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -13,26 +13,26 @@ # limitations under the License. 
"""PerceptionLM model configuration""" -from ...configuration_utils import PretrainedConfig +from transformers.configuration_utils import PretrainedConfig from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig +from typing import Tuple logger = logging.get_logger(__name__) + class PerceptionEncoderConfig(PretrainedConfig): - image_size: int = 448 - patch_size: int = 14 - width: int = 1024 - layers: int = 23 - heads: int = 16 - use_cls_token: bool = True - use_abs_posemb: bool = True - ls_init_value: float = 0.1 - drop_path: float = 0.1 - mlp_ratio: float = 4.0 - use_ln_post: bool = False - pool_type: str = "none" + model_type = "perception_encoder" + architecture = "vit_pe_core_large_patch14_336" + width = 1024 + img_size = (448, 448) + depth = 23 + num_classes = 0 + global_pool = "" + use_post_transformer_norm = False + init_values = 0.1 + ref_feat_shape = (32, 32) class PerceptionLMConfig(PretrainedConfig): r""" diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index db424ea8e22e..d5cfcbc96c3a 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -344,12 +344,32 @@ def permute(w, n_heads, dim1=dim, dim2=dim): for k, v in loaded.items() if "vision_model" in k } - vision_config = PerceptionEncoderConfig(**model_params["vision_model"]) - perception_encoder = PerceptionEncoder(vision_config) - state_dict = checkpoint_filter_fn( - state_dict, perception_encoder.eva_pe + vision_params = model_params["vision_model"] + if vision_params["layers"] == 23 and vision_params["width"] == 1024: + vision_model_name = "vit_pe_core_large_patch14_336" + elif vision_params["layers"] == 47 and vision_params["width"] == 1536: + vision_model_name = "vit_pe_core_gigantic_patch14_448" + else: + raise ValueError(f"Unsupported PE config: {vision_params['layers']} layers and {vision_params['width']} width") + + vision_config = PerceptionEncoderConfig( + use_cls_token=vision_params["use_cls_token"], + width=vision_params["width"], + model_name=vision_model_name, + img_size=(vision_params["image_size"], vision_params["image_size"]), + depth=vision_params["layers"], + num_classes=0, + global_pool="", + use_post_transformer_norm=vision_params["use_ln_post"], + init_values=vision_params["ls_init_value"], + ref_feat_shape=( + vision_params["image_size"] // vision_params["patch_size"], + vision_params["image_size"] // vision_params["patch_size"], + ), ) - state_dict = { "vision_tower.eva_pe." + k: v for k, v in state_dict.items()} + perception_encoder = PerceptionEncoder(vision_config) + state_dict = checkpoint_filter_fn(state_dict, perception_encoder.eva_pe) + state_dict = {"vision_tower.eva_pe." 
+ k: v for k, v in state_dict.items()} for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() @@ -468,10 +488,18 @@ def __init__( model_input_names=["input_ids", "attention_mask"], model_max_length=context_length, clean_up_tokenization_spaces=True, - extra_special_tokens={"image_token": "<|image|>", "video_token": "<|video|>", "pad_token": "<|end_of_text|>"}, + extra_special_tokens={ + "image_token": "<|image|>", + "video_token": "<|video|>", + "pad_token": "<|end_of_text|>", + }, ) - self.converted_tokenizer.image_token_id = self.converted_tokenizer.encode(self.converted_tokenizer.image_token, add_special_tokens=False)[0] - self.converted_tokenizer.video_token_id = self.converted_tokenizer.encode(self.converted_tokenizer.video_token, add_special_tokens=False)[0] + self.converted_tokenizer.image_token_id = self.converted_tokenizer.encode( + self.converted_tokenizer.image_token, add_special_tokens=False + )[0] + self.converted_tokenizer.video_token_id = self.converted_tokenizer.encode( + self.converted_tokenizer.video_token, add_special_tokens=False + )[0] self.update_post_processor(self.converted_tokenizer) # finer special_tokens_map.json self.converted_tokenizer._bos_token = BOS_ADDED_TOKEN @@ -514,7 +542,9 @@ def write_tokenizer( context_length, ).converted_tokenizer - tokenizer.image_token_id = tokenizer.encode(tokenizer.image_token, add_special_tokens=False)[0] + tokenizer.image_token_id = tokenizer.encode( + tokenizer.image_token, add_special_tokens=False + )[0] processor_config = { "pooling_ratio": params["model"]["pooling_ratio"], "patch_size": params["model"]["vision_model"]["patch_size"], diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index f635f20d4afb..544f6bd77566 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -23,9 +23,9 @@ from dataclasses import dataclass from typing import List, Optional, Tuple, Union +import timm import torch import torch.nn.functional as F -from timm.models.eva import vit_pe_core_gigantic_patch14_448, vit_pe_core_large_patch14_336 from torch import nn from transformers.generation.utils import GenerationMixin @@ -41,30 +41,17 @@ class PerceptionEncoder(nn.Module): def __init__(self, config: PerceptionEncoderConfig): super().__init__() - assert config.pool_type == "none" self.use_cls_token = config.use_cls_token - kwargs = { - "img_size": (config.image_size, config.image_size), - "depth": config.layers, - "num_classes": 0, - "global_pool": "", - "use_post_transformer_norm": config.use_ln_post, - "init_values": config.ls_init_value, - "ref_feat_shape": ( - config.image_size // config.patch_size, - config.image_size // config.patch_size, - ), - } - if config.layers == 23 and config.width == 1024: - self.eva_pe = vit_pe_core_large_patch14_336( - **kwargs, - ) - elif config.layers == 47 and config.width == 1536: - self.eva_pe = vit_pe_core_gigantic_patch14_448( - **kwargs, - ) - else: - raise ValueError(f"Unsupported PE config: {config.layers} layers and {config.width} width") + self.eva_pe = timm.create_model( + config.architecture, + img_size=config.img_size, + depth=config.depth, + num_classes=config.num_classes, + global_pool=config.global_pool, + use_post_transformer_norm=config.use_post_transformer_norm, + init_values=config.init_values, + ref_feat_shape=config.ref_feat_shape, + ) self.eva_pe._initialize_weights = 
lambda x: x # disable weight initialization def forward(self, x): diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 4959428eb272..990a8b908009 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -19,6 +19,8 @@ import math +import timm + import torch import torch.utils.checkpoint from torch import nn @@ -45,10 +47,6 @@ logging, auto_docstring, ) -from timm.models.eva import ( - vit_pe_core_large_patch14_336, - vit_pe_core_gigantic_patch14_448, -) from ..auto import AutoModelForCausalLM @@ -65,32 +63,17 @@ class PerceptionEncoder(nn.Module): def __init__(self, config: PerceptionEncoderConfig): super().__init__() - assert config.pool_type == "none" self.use_cls_token = config.use_cls_token - kwargs = { - "img_size": (config.image_size, config.image_size), - "depth": config.layers, - "num_classes": 0, - "global_pool": "", - "use_post_transformer_norm": config.use_ln_post, - "init_values": config.ls_init_value, - "ref_feat_shape": ( - config.image_size // config.patch_size, - config.image_size // config.patch_size, - ), - } - if config.layers == 23 and config.width == 1024: - self.eva_pe = vit_pe_core_large_patch14_336( - **kwargs, - ) - elif config.layers == 47 and config.width == 1536: - self.eva_pe = vit_pe_core_gigantic_patch14_448( - **kwargs, - ) - else: - raise ValueError( - f"Unsupported PE config: {config.layers} layers and {config.width} width" - ) + self.eva_pe = timm.create_model( + config.architecture, + img_size=config.img_size, + depth=config.depth, + num_classes=config.num_classes, + global_pool=config.global_pool, + use_post_transformer_norm=config.use_post_transformer_norm, + init_values=config.init_values, + ref_feat_shape=config.ref_feat_shape, + ) self.eva_pe._initialize_weights = lambda x: x # disable weight initialization def forward(self, x): From 82ff8bdbbbf5ffc354424c55897a3f5e38c07113 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 23 May 2025 00:58:39 +0000 Subject: [PATCH 20/65] Enable AutoModel for PerceptionEncoder. 
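A note on the timm refactor in the patch above, before the PATCH 20 diff below: the encoder backbone is now resolved from `PerceptionEncoderConfig` through `timm.create_model` instead of importing the `vit_pe_core_*` constructors directly. The sketch below mirrors the keyword arguments from the diff; whether the `vit_pe_core_*` architectures are actually available depends on the installed timm version, so treat it as illustrative rather than a guaranteed-working snippet.

```python
# Sketch only: resolve the PE backbone from the config, the way the refactored
# PerceptionEncoder does. Assumes a timm release that ships the vit_pe_core_* models.
import timm

from transformers.models.perception_lm.configuration_perception_lm import PerceptionEncoderConfig

config = PerceptionEncoderConfig()  # defaults correspond to vit_pe_core_large_patch14_336
backbone = timm.create_model(
    config.architecture,
    img_size=config.img_size,
    depth=config.depth,
    num_classes=config.num_classes,
    global_pool=config.global_pool,
    use_post_transformer_norm=config.use_post_transformer_norm,
    init_values=config.init_values,
    ref_feat_shape=config.ref_feat_shape,
)
print(sum(p.numel() for p in backbone.parameters()))  # rough sanity check on model size
```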
--- src/transformers/models/auto/configuration_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 1 + .../perception_lm/configuration_perception_lm.py | 3 ++- .../models/perception_lm/modeling_perception_lm.py | 13 +++++-------- .../models/perception_lm/modular_perception_lm.py | 12 +++++------- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index a68fe785763a..e8eea7733680 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -873,6 +873,7 @@ ("llama4_text", "llama4"), ("blip_2_qformer", "blip_2"), ("fastspeech2_conformer_with_hifigan", "fastspeech2_conformer"), + ("perception_encoder", "perception_lm"), ] ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index d795f1473968..af5ad482649d 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -255,6 +255,7 @@ ("pegasus", "PegasusModel"), ("pegasus_x", "PegasusXModel"), ("perceiver", "PerceiverModel"), + ("perception_encoder", "PerceptionEncoder"), ("persimmon", "PersimmonModel"), ("phi", "PhiModel"), ("phi3", "Phi3Model"), diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 098a2823cf20..d38d1676f4bb 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -24,6 +24,7 @@ class PerceptionEncoderConfig(PretrainedConfig): model_type = "perception_encoder" + use_cls_token = True architecture = "vit_pe_core_large_patch14_336" width = 1024 img_size = (448, 448) @@ -120,4 +121,4 @@ def __init__( super().__init__(**kwargs) -__all__ = ["PerceptionLMConfig"] +__all__ = ["PerceptionLMConfig", "PerceptionEncoderConfig"] diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 544f6bd77566..d8567f5a226c 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -34,13 +34,13 @@ from ...modeling_outputs import ModelOutput from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, can_return_tuple -from ..auto import AutoModelForCausalLM +from ..auto import AutoModel, AutoModelForCausalLM from .configuration_perception_lm import PerceptionEncoderConfig, PerceptionLMConfig -class PerceptionEncoder(nn.Module): +class PerceptionEncoder(PreTrainedModel): def __init__(self, config: PerceptionEncoderConfig): - super().__init__() + super().__init__(config) self.use_cls_token = config.use_cls_token self.eva_pe = timm.create_model( config.architecture, @@ -61,9 +61,6 @@ def forward(self, x): else: return x - def _initialize_weights(self): - pass - class AdaptiveAvgPooling(nn.Module): def __init__(self, pooling_ratio=2): @@ -192,7 +189,7 @@ def __init__(self, config: PerceptionLMConfig): super().__init__(config) self.multi_modal_projector = PerceptionLMMultiModalProjector(config) self.language_model = AutoModelForCausalLM.from_config(config.text_config) - self.vision_tower = PerceptionEncoder(config.vision_config) + self.vision_tower = AutoModel.from_config(config.vision_config) self.post_init() def get_input_embeddings(self): @@ -469,4 
+466,4 @@ def forward( ) -__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel"] +__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel", "PerceptionEncoder"] diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 990a8b908009..05cf75790dd0 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -49,6 +49,7 @@ ) from ..auto import AutoModelForCausalLM +from ..auto import AutoModel from .configuration_perception_lm import PerceptionEncoderConfig, PerceptionLMConfig @@ -60,9 +61,9 @@ _CHECKPOINT_FOR_DOC = "facebook/Perception-LM-1B" -class PerceptionEncoder(nn.Module): +class PerceptionEncoder(PreTrainedModel): def __init__(self, config: PerceptionEncoderConfig): - super().__init__() + super().__init__(config) self.use_cls_token = config.use_cls_token self.eva_pe = timm.create_model( config.architecture, @@ -83,9 +84,6 @@ def forward(self, x): else: return x - def _initialize_weights(self): - pass - class AdaptiveAvgPooling(nn.Module): def __init__(self, pooling_ratio=2): @@ -149,7 +147,7 @@ class PerceptionLMModel(LlavaModel): def __init__(self, config: PerceptionLMConfig): super().__init__(config) del self.vision_tower - self.vision_tower = PerceptionEncoder(config.vision_config) + self.vision_tower = AutoModel.from_config(config.vision_config) self.multi_modal_projector = PerceptionLMMultiModalProjector(config) self.language_model = AutoModelForCausalLM.from_config(config.text_config) @@ -446,4 +444,4 @@ def forward( ) -__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel"] +__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel", "PerceptionEncoder"] From 66a021f1031eb1176eae8fcefee25a3025ea0e44 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 23 May 2025 01:08:17 +0000 Subject: [PATCH 21/65] Update PE config style. --- .../configuration_perception_lm.py | 81 ++++++++++++++++--- 1 file changed, 71 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index d38d1676f4bb..2ee816c6a26c 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -23,17 +23,78 @@ class PerceptionEncoderConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PerceptionEncoder`]. It is used to instantiate a + PerceptionEncoder model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + model_type (`str`, *optional*, defaults to `"perception_encoder"`): + The type of the model. + use_cls_token (`bool`, *optional*, defaults to `True`): + Whether to use a CLS token. + architecture (`str`, *optional*, defaults to `"vit_pe_core_large_patch14_336"`): + The architecture of the model. + width (`int`, *optional*, defaults to `1024`): + The width of the model. + img_size (`Tuple[int, int]`, *optional*, defaults to `(448, 448)`): + The size of the input image. + depth (`int`, *optional*, defaults to `23`): + The depth of the model. 
+ num_classes (`int`, *optional*, defaults to `0`): + The number of classes for classification. + global_pool (`str`, *optional*, defaults to `""`): + The global pooling strategy. + use_post_transformer_norm (`bool`, *optional*, defaults to `False`): + Whether to use post-transformer normalization. + init_values (`float`, *optional*, defaults to `0.1`): + The initialization values. + ref_feat_shape (`Tuple[int, int]`, *optional*, defaults to `(32, 32)`): + The shape of the reference feature. + + Example: + + ```python + >>> from transformers import PerceptionEncoder, PerceptionEncoderConfig + + >>> # Initializing a PerceptionEncoder configuration + >>> configuration = PerceptionEncoderConfig() + + >>> # Initializing a model from the configuration + >>> model = PerceptionEncoder(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ model_type = "perception_encoder" - use_cls_token = True - architecture = "vit_pe_core_large_patch14_336" - width = 1024 - img_size = (448, 448) - depth = 23 - num_classes = 0 - global_pool = "" - use_post_transformer_norm = False - init_values = 0.1 - ref_feat_shape = (32, 32) + def __init__( + self, + use_cls_token=True, + architecture="vit_pe_core_large_patch14_336", + width=1024, + img_size=(448, 448), + depth=23, + num_classes=0, + global_pool="", + use_post_transformer_norm=False, + init_values=0.1, + ref_feat_shape=(32, 32), + **kwargs, + ): + super().__init__(**kwargs) + self.use_cls_token = use_cls_token + self.architecture = architecture + self.width = width + self.img_size = img_size + self.depth = depth + self.num_classes = num_classes + self.global_pool = global_pool + self.use_post_transformer_norm = use_post_transformer_norm + self.init_values = init_values + self.ref_feat_shape = ref_feat_shape class PerceptionLMConfig(PretrainedConfig): r""" From 7d97732aa1bce85e3dad108b7b26aef6894d7aac Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 23 May 2025 01:18:39 +0000 Subject: [PATCH 22/65] update all headers --- .../models/perception_lm/configuration_perception_lm.py | 2 +- .../perception_lm/convert_perception_lm_weights_to_hf.py | 4 ++-- .../perception_lm/image_processing_perception_lm_fast.py | 4 +--- .../models/perception_lm/video_processing_perception_lm.py | 4 +--- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 2ee816c6a26c..88b00acb6a69 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2025 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved. +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index d5cfcbc96c3a..e5842e7b4576 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -1,5 +1,5 @@ -# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. -# +# coding=utf-8 +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 9c6ab3812adb..2e4c914cfd75 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -1,6 +1,4 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/src/transformers/models/perception_lm/video_processing_perception_lm.py b/src/transformers/models/perception_lm/video_processing_perception_lm.py index 58c0d8d1b651..9e6596e1c801 100644 --- a/src/transformers/models/perception_lm/video_processing_perception_lm.py +++ b/src/transformers/models/perception_lm/video_processing_perception_lm.py @@ -1,6 +1,5 @@ # coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -25,7 +24,6 @@ BaseVideoProcessor, ) - if is_vision_available(): from ...image_utils import PILImageResampling From 70480d46dc282ed1433c44d9b272f66d5e24be2e Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 30 May 2025 20:52:02 +0000 Subject: [PATCH 23/65] Minor fixes. 
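A note on the config rework from PATCH 21, before the diff below: with the keyword-style `__init__`, `PerceptionEncoderConfig` behaves like any other `PretrainedConfig`, so overrides can be serialized and restored. The snippet is illustrative only; the gigantic-variant values (depth 47, width 1536) are taken from the conversion script above, and the import path assumes these patches are applied.

```python
# Illustrative sketch: keyword overrides on PerceptionEncoderConfig survive a
# to_dict()/from_dict() round trip, like any PretrainedConfig subclass.
from transformers.models.perception_lm.configuration_perception_lm import PerceptionEncoderConfig

cfg = PerceptionEncoderConfig(
    architecture="vit_pe_core_gigantic_patch14_448",  # values from the conversion script
    depth=47,
    width=1536,
)
restored = PerceptionEncoderConfig.from_dict(cfg.to_dict())
assert restored.architecture == "vit_pe_core_gigantic_patch14_448"
assert restored.depth == 47 and restored.width == 1536
```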
--- src/transformers/models/auto/modeling_auto.py | 1 - .../models/perception_lm/configuration_perception_lm.py | 2 +- .../models/perception_lm/modular_perception_lm.py | 6 +++--- test.py | 4 +--- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index af5ad482649d..77b7564e3951 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -409,7 +409,6 @@ ("janus", "JanusForConditionalGeneration"), ("layoutlm", "LayoutLMForMaskedLM"), ("llava", "LlavaForConditionalGeneration"), - ("perception_lm", "PerceptionLMForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 88b00acb6a69..cffd36c8cae3 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -150,7 +150,7 @@ class PerceptionLMConfig(PretrainedConfig): ```""" model_type = "perception_lm" - sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + sub_configs = {"text_config": AutoConfig, "vision_config": PerceptionEncoderConfig} def __init__( self, diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 05cf75790dd0..c5d7c20650ff 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -1,6 +1,5 @@ # coding=utf-8 -# Copyright 2025 the HuggingFace Inc. team. All rights reserved. -# +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -93,7 +92,8 @@ def __init__(self, pooling_ratio=2): def forward(self, x): b, num_tokens, c = x.shape h = int(math.sqrt(num_tokens)) - assert h * h == num_tokens + if h * h != num_tokens: + raise ValueError(f"num_tokens {num_tokens} is expected to be a square number") shape = (h // self.pooling_ratio, h // self.pooling_ratio) x = x.permute(0, 2, 1).reshape(b, -1, h, h) diff --git a/test.py b/test.py index 74a8128c9c15..e234ebaf3f9a 100644 --- a/test.py +++ b/test.py @@ -1,12 +1,10 @@ -import torch - from transformers import AutoProcessor from transformers import PerceptionLMForConditionalGeneration processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b", use_fast=True) print(type(processor)) -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to("cuda") conversation1 = [ { "role": "user", From 3d65bc9d47195d2d391e7eff0ae1c35a8918329f Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Sat, 31 May 2025 02:34:02 +0000 Subject: [PATCH 24/65] Move lm_head to PerceptionLMForConditionalGeneration. Fix vit_G model specification. 
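For the lm_head move in the diff below: the head becomes a plain `nn.Linear` on `PerceptionLMForConditionalGeneration`, and logits are only computed for the positions selected by `logits_to_keep`. A small, self-contained sketch of that slicing pattern follows (shapes and sizes are made-up illustration values, not from the model).

```python
# Sketch of the logits_to_keep slicing used in the forward pass below.
# hidden_size/vocab_size/shapes here are arbitrary illustration values.
import torch
from torch import nn

hidden_size, vocab_size = 64, 128
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
hidden_states = torch.randn(2, 10, hidden_size)  # [batch, seq_len, hidden]

logits_to_keep = 1  # during generation only the last position is usually needed
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
logits = lm_head(hidden_states[:, slice_indices, :])
print(logits.shape)  # torch.Size([2, 1, 128])
```

Computing logits only for the kept positions avoids materializing a full `[batch, seq_len, vocab]` tensor when just the loss or the next-token logits are needed.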
--- .../configuration_perception_lm.py | 3 +- .../convert_perception_lm_weights_to_hf.py | 53 +++++++++--------- .../image_processing_perception_lm_fast.py | 2 - .../perception_lm/modeling_perception_lm.py | 54 ++++++++----------- .../perception_lm/modular_perception_lm.py | 52 +++++++----------- .../perception_lm/processing_perception_lm.py | 5 -- test.py | 6 +-- 7 files changed, 74 insertions(+), 101 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index cffd36c8cae3..77a6da7a29c5 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -159,6 +159,7 @@ def __init__( projector_pooling_ratio=1, image_token_id=128002, video_token_id=128003, + tie_word_embeddings=False, **kwargs, ): self.image_token_id = image_token_id @@ -179,7 +180,7 @@ def __init__( self.text_config = text_config self.projector_pooling_ratio = projector_pooling_ratio - super().__init__(**kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) __all__ = ["PerceptionLMConfig", "PerceptionEncoderConfig"] diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index e5842e7b4576..b7ee4671753a 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -270,38 +270,38 @@ def permute(w, n_heads, dim1=dim, dim2=dim): filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 2}.bin" assert num_shards == 1, "PerceptionLM does not support sharded weights" state_dict = { - f"language_model.model.layers.{layer_i}.self_attn.q_proj.weight": permute( + f"model.language_model.layers.{layer_i}.self_attn.q_proj.weight": permute( loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads ), - f"language_model.model.layers.{layer_i}.self_attn.k_proj.weight": permute( + f"model.language_model.layers.{layer_i}.self_attn.k_proj.weight": permute( loaded[f"layers.{layer_i}.attention.wk.weight"], n_heads=num_key_value_heads, dim1=key_value_dim, ), - f"language_model.model.layers.{layer_i}.self_attn.v_proj.weight": loaded[ + f"model.language_model.layers.{layer_i}.self_attn.v_proj.weight": loaded[ f"layers.{layer_i}.attention.wv.weight" ], - f"language_model.model.layers.{layer_i}.self_attn.o_proj.weight": loaded[ + f"model.language_model.layers.{layer_i}.self_attn.o_proj.weight": loaded[ f"layers.{layer_i}.attention.wo.weight" ], - f"language_model.model.layers.{layer_i}.mlp.gate_proj.weight": loaded[ + f"model.language_model.layers.{layer_i}.mlp.gate_proj.weight": loaded[ f"layers.{layer_i}.feed_forward.w1.weight" ], - f"language_model.model.layers.{layer_i}.mlp.down_proj.weight": loaded[ + f"model.language_model.layers.{layer_i}.mlp.down_proj.weight": loaded[ f"layers.{layer_i}.feed_forward.w2.weight" ], - f"language_model.model.layers.{layer_i}.mlp.up_proj.weight": loaded[ + f"model.language_model.layers.{layer_i}.mlp.up_proj.weight": loaded[ f"layers.{layer_i}.feed_forward.w3.weight" ], - f"language_model.model.layers.{layer_i}.input_layernorm.weight": loaded[ + f"model.language_model.layers.{layer_i}.input_layernorm.weight": loaded[ f"layers.{layer_i}.attention_norm.weight" ], - f"language_model.model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ + 
f"model.language_model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ f"layers.{layer_i}.ffn_norm.weight" ], } state_dict[ - f"language_model.model.layers.{layer_i}.self_attn.rotary_emb.inv_freq" + f"model.language_model.layers.{layer_i}.self_attn.rotary_emb.inv_freq" ] = inv_freq for k, v in state_dict.items(): index_dict["weight_map"][k] = filename @@ -312,26 +312,23 @@ def permute(w, n_heads, dim1=dim, dim2=dim): filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 2}.bin" state_dict = { - "language_model.model.embed_tokens.weight": loaded["tok_embeddings.weight"], - "language_model.model.norm.weight": loaded["norm.weight"], - "language_model.lm_head.weight": ( - loaded["output.weight"] - if not tie_word_embeddings - else loaded["tok_embeddings.weight"] - ), - "multi_modal_projector.projector.0.weight": loaded[ + "model.language_model.embed_tokens.weight": loaded["tok_embeddings.weight"], + "model.language_model.norm.weight": loaded["norm.weight"], + "model.multi_modal_projector.projector.0.weight": loaded[ "vision_projector.projector.0.weight" ], - "multi_modal_projector.projector.2.weight": loaded[ + "model.multi_modal_projector.projector.2.weight": loaded[ "vision_projector.projector.2.weight" ], - "multi_modal_projector.projector.0.bias": loaded[ + "model.multi_modal_projector.projector.0.bias": loaded[ "vision_projector.projector.0.bias" ], - "multi_modal_projector.projector.2.bias": loaded[ + "model.multi_modal_projector.projector.2.bias": loaded[ "vision_projector.projector.2.bias" ], } + if not tie_word_embeddings: + state_dict["lm_head.weight"] = loaded["output.weight"] for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() @@ -346,16 +343,16 @@ def permute(w, n_heads, dim1=dim, dim2=dim): } vision_params = model_params["vision_model"] if vision_params["layers"] == 23 and vision_params["width"] == 1024: - vision_model_name = "vit_pe_core_large_patch14_336" + architecture = "vit_pe_core_large_patch14_336" elif vision_params["layers"] == 47 and vision_params["width"] == 1536: - vision_model_name = "vit_pe_core_gigantic_patch14_448" + architecture = "vit_pe_core_gigantic_patch14_448" else: raise ValueError(f"Unsupported PE config: {vision_params['layers']} layers and {vision_params['width']} width") vision_config = PerceptionEncoderConfig( use_cls_token=vision_params["use_cls_token"], width=vision_params["width"], - model_name=vision_model_name, + architecture=architecture, img_size=(vision_params["image_size"], vision_params["image_size"]), depth=vision_params["layers"], num_classes=0, @@ -369,7 +366,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): ) perception_encoder = PerceptionEncoder(vision_config) state_dict = checkpoint_filter_fn(state_dict, perception_encoder.eva_pe) - state_dict = {"vision_tower.eva_pe." + k: v for k, v in state_dict.items()} + state_dict = {"model.vision_tower.eva_pe." + k: v for k, v in state_dict.items()} for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() @@ -431,6 +428,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): vision_config=vision_config.to_dict(), projector_pooling_ratio=projector_pooling_ratio, image_token_id=image_token_id, + tie_word_embeddings=tie_word_embeddings, ) config.save_pretrained(tmp_model_path) @@ -444,6 +442,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): # Make space so we can load the model properly now. 
del state_dict + # output_weight = loaded.get("output.weight", None) del loaded gc.collect() @@ -451,6 +450,10 @@ def permute(w, n_heads, dim1=dim, dim2=dim): model = PerceptionLMForConditionalGeneration.from_pretrained( tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True ) + # if not tie_word_embeddings: + # if output_weight is None: + # raise ValueError("Output weight/lm_head is not found in the checkpoint.") + # model.lm_head.load_state_dict({"weight": output_weight}) # Avoid saving this as part of the config. del model.config._name_or_path diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 2e4c914cfd75..8eef962bf4a3 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -42,8 +42,6 @@ class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): image_res: int = 448 max_num_tiles: int = 36 normalize_img: bool = True - return_tensors: Optional[Union[str, TensorType]] = None - @add_start_docstrings( "Constructs a fast PerceptionLM image processor.", diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index d8567f5a226c..160c51d434c0 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -5,8 +5,7 @@ # modular_perception_lm.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 -# Copyright 2025 the HuggingFace Inc. team. All rights reserved. -# +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -34,7 +33,7 @@ from ...modeling_outputs import ModelOutput from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, can_return_tuple -from ..auto import AutoModel, AutoModelForCausalLM +from ..auto import AutoModel from .configuration_perception_lm import PerceptionEncoderConfig, PerceptionLMConfig @@ -70,7 +69,8 @@ def __init__(self, pooling_ratio=2): def forward(self, x): b, num_tokens, c = x.shape h = int(math.sqrt(num_tokens)) - assert h * h == num_tokens + if h * h != num_tokens: + raise ValueError(f"num_tokens {num_tokens} is expected to be a square number") shape = (h // self.pooling_ratio, h // self.pooling_ratio) x = x.permute(0, 2, 1).reshape(b, -1, h, h) @@ -116,7 +116,7 @@ def forward(self, features): @auto_docstring class PerceptionLMPreTrainedModel(PreTrainedModel): config_class = PerceptionLMConfig - base_model_prefix = "model" + base_model_prefix = "" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" _supports_cache_class = True @@ -188,7 +188,7 @@ class PerceptionLMModel(PerceptionLMPreTrainedModel): def __init__(self, config: PerceptionLMConfig): super().__init__(config) self.multi_modal_projector = PerceptionLMMultiModalProjector(config) - self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.language_model = AutoModel.from_config(config.text_config) self.vision_tower = AutoModel.from_config(config.vision_config) self.post_init() @@ -322,19 +322,25 @@ def forward( ) return outputs, image_features - def get_output_embeddings(self): - return self.language_model.get_output_embeddings() - @auto_docstring class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + def __init__(self, config: PerceptionLMConfig, **super_kwargs): super().__init__(config, **super_kwargs) self.model = PerceptionLMModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) self.post_init() + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + def get_output_embeddings(self): - return self.model.get_output_embeddings() + return self.lm_head def prepare_inputs_for_generation( self, @@ -350,7 +356,7 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - model_inputs = self.model.language_model.prepare_inputs_for_generation( + model_inputs = super().prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, @@ -431,31 +437,17 @@ def forward( **lm_kwargs, ) - logits = outputs[0] + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) loss = None if labels is not None: - # Shift so that tokens < n predict n - if attention_mask is not None: - # we use the input attention mask to shift the logits and labels, because it is 2D. 
- # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft - shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) - shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() - shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() - else: - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1).to(shift_logits.device), + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **lm_kwargs ) - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - return PerceptionLMCausalLMOutputWithPast( loss=loss, logits=logits, diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index c5d7c20650ff..07cfd728d61f 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -139,8 +139,7 @@ def forward(self, features): class PerceptionLMPreTrainedModel(LlavaPreTrainedModel): - base_model_prefix = "model" - + base_model_prefix = "" @auto_docstring class PerceptionLMModel(LlavaModel): @@ -149,10 +148,7 @@ def __init__(self, config: PerceptionLMConfig): del self.vision_tower self.vision_tower = AutoModel.from_config(config.vision_config) self.multi_modal_projector = PerceptionLMMultiModalProjector(config) - self.language_model = AutoModelForCausalLM.from_config(config.text_config) - - def get_output_embeddings(self): - return self.language_model.get_output_embeddings() + self.language_model = AutoModel.from_config(config.text_config) def get_image_features( self, @@ -299,14 +295,22 @@ def forward( class PerceptionLMForConditionalGeneration( PerceptionLMPreTrainedModel, GenerationMixin ): + _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: PerceptionLMConfig, **super_kwargs): super().__init__(config, **super_kwargs) self.model = PerceptionLMModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) self.post_init() + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + def get_output_embeddings(self): - return self.model.get_output_embeddings() + return self.lm_head def prepare_inputs_for_generation( self, @@ -322,7 +326,7 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - model_inputs = self.model.language_model.prepare_inputs_for_generation( + model_inputs = super().prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, @@ -403,37 +407,17 @@ def forward( **lm_kwargs, ) - logits = outputs[0] + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) loss = None if labels is not None: - # Shift so that tokens < n predict n - if attention_mask is not None: - # we use the input 
attention mask to shift the logits and labels, because it is 2D. - # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft - shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to( - logits.device - ) - shift_logits = logits[..., :-1, :][ - shift_attention_mask.to(logits.device) != 0 - ].contiguous() - shift_labels = labels[..., 1:][ - shift_attention_mask.to(labels.device) != 0 - ].contiguous() - else: - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1).to(shift_logits.device), + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **lm_kwargs ) - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - return PerceptionLMCausalLMOutputWithPast( loss=loss, logits=logits, diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index b6aeb5a7cde5..5b8b6b70eb04 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -142,17 +142,12 @@ def __call__( if images is None and text is None: raise ValueError("You have to specify at least one of `images` or `text`.") - # check if images and text inputs are reversed for BC - images, text = _validate_images_text_input_order(images, text) - output_kwargs = self._merge_kwargs( PerceptionLMProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) if images is not None: - print("image_processor class", self.image_processor.__class__) - images = [] if images is None else images image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) else: image_inputs = {} diff --git a/test.py b/test.py index e234ebaf3f9a..dda0be6f42ee 100644 --- a/test.py +++ b/test.py @@ -1,10 +1,10 @@ from transformers import AutoProcessor from transformers import PerceptionLMForConditionalGeneration -processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b", use_fast=True) -print(type(processor)) -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to("cuda") +MODEL_PATH = "/checkpoint/vision_encoder/smhu/debug/plm_hf_1b" +processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True) +model = PerceptionLMForConditionalGeneration.from_pretrained(MODEL_PATH).to("cuda") conversation1 = [ { "role": "user", From 1137a6791b491c73ccbd32a9f6535eaa39602f35 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 11 Jun 2025 22:57:55 +0000 Subject: [PATCH 25/65] Fix for testing_modeling_perception_lm.py --- .../configuration_perception_lm.py | 6 +- .../perception_lm/modeling_perception_lm.py | 16 +- .../perception_lm/modular_perception_lm.py | 37 +- .../test_modeling_perception_lm.py | 498 ++++++------------ 4 files changed, 201 insertions(+), 356 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 77a6da7a29c5..bc08ba0fcacf 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -75,13 +75,13 @@ def 
__init__( use_cls_token=True, architecture="vit_pe_core_large_patch14_336", width=1024, - img_size=(448, 448), + img_size=[448, 448], depth=23, num_classes=0, global_pool="", use_post_transformer_norm=False, init_values=0.1, - ref_feat_shape=(32, 32), + ref_feat_shape=[32, 32], **kwargs, ): super().__init__(**kwargs) @@ -112,7 +112,7 @@ class PerceptionLMConfig(PretrainedConfig): The config object or dictionary of the vision backbone. text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): The config object or dictionary of the text backbone. - image_token_index (`int`, *optional*, defaults to 32000): + image_token_id (`int`, *optional*, defaults to 32000): The image token index to encode the image prompt. projector_hidden_act (`str`, *optional*, defaults to `"gelu"`): The activation function used by the multimodal projector. diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 160c51d434c0..14d2690de08d 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -50,6 +50,7 @@ def __init__(self, config: PerceptionEncoderConfig): use_post_transformer_norm=config.use_post_transformer_norm, init_values=config.init_values, ref_feat_shape=config.ref_feat_shape, + embed_dim=config.width, ) self.eva_pe._initialize_weights = lambda x: x # disable weight initialization @@ -294,6 +295,7 @@ def forward( pixel_values=pixel_values.to(inputs_embeds), ) special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) + self.check_mask_feature_size_match(special_image_mask, image_features) special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) image_features = image_features.to(inputs_embeds) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) @@ -303,6 +305,7 @@ def forward( pixel_values=pixel_values_videos.to(inputs_embeds), ) special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1) + self.check_mask_feature_size_match(special_video_mask, video_features) special_video_mask = special_video_mask.expand_as(inputs_embeds).to(inputs_embeds.device) video_features = video_features.to(inputs_embeds) inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features) @@ -322,6 +325,14 @@ def forward( ) return outputs, image_features + def check_mask_feature_size_match(self, media_mask, media_features): + media_token_count = media_mask.sum() + media_feature_size = media_features.size()[:-1].numel() + if media_token_count != media_feature_size: + raise ValueError( + f"The number of tokens in the media mask ({media_token_count}) does not match the number of features in the media features ({media_feature_size}. 
Features shape: {media_features.shape})" + ) + @auto_docstring class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): @@ -445,7 +456,10 @@ def forward( loss = None if labels is not None: loss = self.loss_function( - logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **lm_kwargs + logits=logits, + labels=labels, + vocab_size=self.config.text_config.vocab_size, + **lm_kwargs, ) return PerceptionLMCausalLMOutputWithPast( diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 07cfd728d61f..bf7ebfc84817 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -73,6 +73,7 @@ def __init__(self, config: PerceptionEncoderConfig): use_post_transformer_norm=config.use_post_transformer_norm, init_values=config.init_values, ref_feat_shape=config.ref_feat_shape, + embed_dim=config.width, ) self.eva_pe._initialize_weights = lambda x: x # disable weight initialization @@ -93,7 +94,9 @@ def forward(self, x): b, num_tokens, c = x.shape h = int(math.sqrt(num_tokens)) if h * h != num_tokens: - raise ValueError(f"num_tokens {num_tokens} is expected to be a square number") + raise ValueError( + f"num_tokens {num_tokens} is expected to be a square number" + ) shape = (h // self.pooling_ratio, h // self.pooling_ratio) x = x.permute(0, 2, 1).reshape(b, -1, h, h) @@ -141,6 +144,7 @@ def forward(self, features): class PerceptionLMPreTrainedModel(LlavaPreTrainedModel): base_model_prefix = "" + @auto_docstring class PerceptionLMModel(LlavaModel): def __init__(self, config: PerceptionLMConfig): @@ -168,6 +172,14 @@ def get_image_features( image_features = self.multi_modal_projector(image_outputs) return image_features + def check_mask_feature_size_match(self, media_mask, media_features): + media_token_count = media_mask.sum() + media_feature_size = media_features.size()[:-1].numel() + if media_token_count != media_feature_size: + raise ValueError( + f"The number of tokens in the media mask ({media_token_count}) does not match the number of features in the media features ({media_feature_size}. 
Features shape: {media_features.shape})" + ) + def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -254,6 +266,7 @@ def forward( pixel_values=pixel_values.to(inputs_embeds), ) special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) + self.check_mask_feature_size_match(special_image_mask, image_features) special_image_mask = special_image_mask.expand_as(inputs_embeds).to( inputs_embeds.device ) @@ -267,6 +280,7 @@ def forward( pixel_values=pixel_values_videos.to(inputs_embeds), ) special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1) + self.check_mask_feature_size_match(special_video_mask, video_features) special_video_mask = special_video_mask.expand_as(inputs_embeds).to( inputs_embeds.device ) @@ -300,7 +314,9 @@ class PerceptionLMForConditionalGeneration( def __init__(self, config: PerceptionLMConfig, **super_kwargs): super().__init__(config, **super_kwargs) self.model = PerceptionLMModel(config) - self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.lm_head = nn.Linear( + config.text_config.hidden_size, config.text_config.vocab_size, bias=False + ) self.post_init() def get_input_embeddings(self): @@ -409,13 +425,20 @@ def forward( hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + slice_indices = ( + slice(-logits_to_keep, None) + if isinstance(logits_to_keep, int) + else logits_to_keep + ) logits = self.lm_head(hidden_states[:, slice_indices, :]) loss = None if labels is not None: loss = self.loss_function( - logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **lm_kwargs + logits=logits, + labels=labels, + vocab_size=self.config.text_config.vocab_size, + **lm_kwargs, ) return PerceptionLMCausalLMOutputWithPast( @@ -428,4 +451,8 @@ def forward( ) -__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel", "PerceptionEncoder"] +__all__ = [ + "PerceptionLMForConditionalGeneration", + "PerceptionLMPreTrainedModel", + "PerceptionEncoder", +] diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index ed3e77a84a34..8d9d2b0ef144 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -15,6 +15,7 @@ import unittest +import numpy as np import requests from parameterized import parameterized @@ -52,12 +53,11 @@ class PerceptionLMVisionText2TextModelTester: def __init__( self, parent, - ignore_index=-100, - image_token_index=0, - projector_hidden_act="gelu", + image_token_id=0, + video_token_id=2, seq_length=7, - vision_feature_select_strategy="default", - vision_feature_layer=-1, + tie_word_embeddings=True, + projector_pooling_ratio=1, text_config={ "model_type": "llama", "seq_length": 7, @@ -83,26 +83,21 @@ def __init__( }, is_training=True, vision_config={ - "image_size": 8, - "patch_size": 2, - "num_channels": 3, - "is_training": True, - "hidden_size": 32, - "projection_dim": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "dropout": 0.1, - "attention_dropout": 0.1, - "initializer_range": 0.02, + "use_cls_token": True, + "architecture": "vit_pe_core_large_patch14_336", + "width": 64, + "img_size": (14, 14), + "depth": 2, + "num_classes": 0, + "global_pool": "", + 
"use_post_transformer_norm": False, + "init_values": 0.1, + "ref_feat_shape": (1, 1), }, ): self.parent = parent - self.ignore_index = ignore_index - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer + self.image_token_id = image_token_id + self.video_token_id = video_token_id self.text_config = text_config self.vision_config = vision_config self.pad_token_id = text_config["pad_token_id"] @@ -112,11 +107,16 @@ def __init__( self.hidden_size = text_config["hidden_size"] self.num_attention_heads = text_config["num_attention_heads"] self.is_training = is_training + self.tie_word_embeddings = tie_word_embeddings self.batch_size = 3 + self.num_tiles = 1 + self.num_frames = 1 self.num_channels = 3 - self.image_size = 336 - self.num_image_tokens = (self.vision_config["image_size"] // self.vision_config["patch_size"]) ** 2 + self.image_size = self.vision_config["img_size"][0] + self.num_image_tokens = ( + self.vision_config["img_size"][0] // 14 + ) ** 2 self.seq_length = seq_length + self.num_image_tokens self.encoder_seq_length = self.seq_length @@ -124,36 +124,53 @@ def get_config(self): return PerceptionLMConfig( text_config=self.text_config, vision_config=self.vision_config, - ignore_index=self.ignore_index, - image_token_index=self.image_token_index, - projector_hidden_act=self.projector_hidden_act, - vision_feature_select_strategy=self.vision_feature_select_strategy, - vision_feature_layer=self.vision_feature_layer, - image_seq_length=self.num_image_tokens, + image_token_id=self.image_token_id, + video_token_id=self.video_token_id, + tie_word_embeddings=self.tie_word_embeddings, ) def prepare_config_and_inputs(self): pixel_values = floats_tensor( [ self.batch_size, - self.vision_config["num_channels"], - self.vision_config["image_size"], - self.vision_config["image_size"], + self.num_tiles, + self.num_channels, + self.vision_config["img_size"][0], + self.vision_config["img_size"][1], + ] + ) + pixel_values_videos = floats_tensor( + [ + self.batch_size, + self.num_frames, + self.num_channels, + self.vision_config["img_size"][0], + self.vision_config["img_size"][1], ] ) config = self.get_config() - return config, pixel_values + return config, pixel_values, pixel_values_videos def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = input_ids.ne(1).to(torch_device) - input_ids[input_ids == config.image_token_index] = self.pad_token_id - input_ids[:, : self.num_image_tokens] = config.image_token_index + config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs() + input_ids = ( + ids_tensor( + [self.batch_size, self.seq_length], config.text_config.vocab_size - 2 + ) + + 2 + ) + attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) + input_ids[input_ids == config.image_token_id] = self.pad_token_id + input_ids[input_ids == config.video_token_id] = self.pad_token_id + input_ids[:, : self.num_image_tokens] = config.image_token_id + # input_ids[ + # :, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens + # ] = config.video_token_id + inputs_dict = { "pixel_values": pixel_values, + # "pixel_values_videos": pixel_values_videos, "input_ids": input_ids, "attention_mask": 
attention_mask, } @@ -161,21 +178,31 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class PerceptionLMForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class PerceptionLMForConditionalGenerationModelTest( + ModelTesterMixin, GenerationTesterMixin, unittest.TestCase +): """ Model tester for `PerceptionLMForConditionalGeneration`. """ - all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () + all_model_classes = ( + (PerceptionLMForConditionalGeneration,) if is_torch_available() else () + ) test_pruning = False test_head_masking = False _is_composite = True def setUp(self): self.model_tester = PerceptionLMVisionText2TextModelTester(self) - common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + common_properties = [ + "image_token_id", + "video_token_id", + ] self.config_tester = ConfigTester( - self, config_class=PerceptionLMConfig, has_text_modality=False, common_properties=common_properties + self, + config_class=PerceptionLMConfig, + has_text_modality=False, + common_properties=common_properties, ) def test_config(self): @@ -253,31 +280,6 @@ def test_mismatching_num_image_tokens(self): pixel_values = torch.cat([pixel_values, pixel_values], dim=0) _ = model(input_ids=input_ids, pixel_values=pixel_values) - @parameterized.expand( - [ - (-1,), - ([-1],), - ([-1, -2],), - ], - ) - def test_vision_feature_layers(self, vision_feature_layer): - """ - Test that we can use either one vision feature layer, or a list of - vision feature layers. - """ - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.vision_feature_layer = vision_feature_layer - - num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) - hidden_size = config.vision_config.hidden_size - expected_features = hidden_size * num_feature_layers - - for model_class in self.all_model_classes: - model = model_class(config).to(torch_device) - # We should have the right number of input features, - # and should be able to run a forward pass without exploding - assert model.multi_modal_projector.linear_1.in_features == expected_features - model(**input_dict) @unittest.skip( reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" @@ -304,10 +306,44 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): pass +TEST_MODEL_PATH = "/checkpoint/vision_encoder/smhu/debug/plm_hf_1b" + + @require_torch class PerceptionLMForConditionalGenerationIntegrationTest(unittest.TestCase): def setUp(self): - self.processor = AutoProcessor.from_pretrained("perception_lm-hf/bakPerceptionLM-v1-hf") + self.processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH) + # image_file = hf_hub_download( + # repo_id="raushan-testing-hf/images_test", filename="llava_v1_5_radar.jpg", repo_type="dataset" + # ) + # video_file = hf_hub_download( + # repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" + # ) + self.image_file = ( + "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/14496_0.PNG" + ) + self.video_file = "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4" + self.conversation1 = [ + { + "role": "user", + "content": [ + {"type": "image", "url": self.image_file}, + {"type": "text", "text": "Describe the bar plot in the image."}, + ], + } + ] + self.conversation2 = [ 
+ { + "role": "user", + "content": [ + { + "type": "video", + "url": self.video_file, + }, + {"type": "text", "text": "Can you describe the video in detail?"}, + ], + } + ] def tearDown(self): cleanup(torch_device, gc_collect=True) @@ -315,190 +351,75 @@ def tearDown(self): @slow @require_bitsandbytes def test_small_model_integration_test(self): - # Let's make sure we test the preprocessing to replace what is used - model = PerceptionLMForConditionalGeneration.from_pretrained("perception_lm-hf/bakPerceptionLM-v1-hf", load_in_4bit=True) - - prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:" - image_file = "https://perception_lm-vl.github.io/static/images/view.jpg" - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device) - - output = model.generate(**inputs, max_new_tokens=20) - EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip - - self.assertEqual( - self.processor.decode(output[0], skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - @require_bitsandbytes - def test_small_model_integration_test_llama_single(self): - # Let's make sure we test the preprocessing to replace what is used - model_id = "facebook/Perception-LM-1B" - - model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B", load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - - prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:" - image_file = "https://perception_lm-vl.github.io/static/images/view.jpg" - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) - - output = model.generate(**inputs, max_new_tokens=900, do_sample=False) - EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip - - self.assertEqual( - processor.decode(output[0], skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - @require_bitsandbytes - def test_small_model_integration_test_llama_batched(self): - # Let's make sure we test the preprocessing to replace what is used - model_id = "facebook/Perception-LM-1B" - - model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B", load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - - prompts = [ - "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT:", - "USER: \nWhat is this? 
ASSISTANT:", - ] - image1 = Image.open(requests.get("https://perception_lm-vl.github.io/static/images/view.jpg", stream=True).raw) - image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - - inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to(torch_device) - - output = model.generate(**inputs, max_new_tokens=20) - - EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you', 'USER: \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on'] # fmt: skip - - self.assertEqual( - processor.batch_decode(output, skip_special_tokens=True), - EXPECTED_DECODED_TEXT, + model = PerceptionLMForConditionalGeneration.from_pretrained( + TEST_MODEL_PATH, load_in_4bit=True, cache_dir="./" ) - @slow - @require_bitsandbytes - def test_small_model_integration_test_batch(self): - # Let's make sure we test the preprocessing to replace what is used - model = PerceptionLMForConditionalGeneration.from_pretrained("perception_lm-hf/bakPerceptionLM-v1-hf", load_in_4bit=True) - # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!. - prompts = [ - "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", - "USER: \nWhat is this?\nASSISTANT:", - ] - image1 = Image.open(requests.get("https://perception_lm-vl.github.io/static/images/view.jpg", stream=True).raw) - image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + inputs = self.processor.apply_chat_template( + [self.conversation1], + num_frames=32, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + video_load_backend="decord", + padding=True, + padding_side="left", + ).to(torch_device) - inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to( - torch_device - ) + generate_ids = model.generate(**inputs, max_new_tokens=18) + input_length = inputs["input_ids"].shape[1] + generate_ids_without_inputs = generate_ids[:, input_length:] - output = model.generate(**inputs, max_new_tokens=20) + EXPECTED_DECODED_TEXT = "The bar plot displays the values of four categories: step, horror, mood, and lumber" # fmt: skip - EXPECTED_DECODED_TEXT = [ - 'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.', - 'USER: \nWhat is this?\nASSISTANT: Cats' - ] # fmt: skip self.assertEqual( - self.processor.batch_decode(output, skip_special_tokens=True), + self.processor.decode( + generate_ids_without_inputs[0], skip_special_tokens=True + ), EXPECTED_DECODED_TEXT, ) @slow @require_bitsandbytes - def test_small_model_integration_test_llama_batched_regression(self): - # Let's make sure we test the preprocessing to replace what is used - model_id = "facebook/Perception-LM-1B" - - # Multi-image & multi-prompt (e.g. 
3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before) + def test_small_model_integration_test_batched(self): model = PerceptionLMForConditionalGeneration.from_pretrained( - "facebook/Perception-LM-1B", load_in_4bit=True, attn_implementation="eager" - ) - processor = AutoProcessor.from_pretrained(model_id, pad_token="") - - prompts = [ - "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", - "USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT:", - ] - image1 = Image.open(requests.get("https://perception_lm-vl.github.io/static/images/view.jpg", stream=True).raw) - image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - - inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True).to( - torch_device - ) - - output = model.generate(**inputs, max_new_tokens=20) - - EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'] # fmt: skip - - self.assertEqual( - processor.batch_decode(output, skip_special_tokens=True), - EXPECTED_DECODED_TEXT, + TEST_MODEL_PATH, load_in_4bit=True ) - - @slow - @require_torch - @require_vision - def test_batched_generation(self): - model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B", load_in_4bit=True) - - processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B") - - prompt1 = "\n\nUSER: What's the difference of two images?\nASSISTANT:" - prompt2 = "\nUSER: Describe the image.\nASSISTANT:" - prompt3 = "\nUSER: Describe the image.\nASSISTANT:" - url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" - url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" - image1 = Image.open(requests.get(url1, stream=True).raw) - image2 = Image.open(requests.get(url2, stream=True).raw) - - inputs = processor( - images=[image1, image2, image1, image2], - text=[prompt1, prompt2, prompt3], + processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH) + inputs = processor.apply_chat_template( + [self.conversation1, self.conversation2], + num_frames=32, + add_generation_prompt=True, + tokenize=True, + return_dict=True, return_tensors="pt", + video_load_backend="decord", padding=True, + padding_side="left", ).to(torch_device) - model = model.eval() - - EXPECTED_OUTPUT = [ - "\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while", - "\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small", - "\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. 
The llama is the", - ] - - generate_ids = model.generate(**inputs, max_new_tokens=20) - outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) - self.assertEqual(outputs, EXPECTED_OUTPUT) + generate_ids = model.generate(**inputs, max_new_tokens=18) + input_length = inputs["input_ids"].shape[1] + generate_ids_without_inputs = generate_ids[:, input_length:] - def test_tokenizer_integration(self): - slow_tokenizer = AutoTokenizer.from_pretrained("liuhaotian/perception_lm-v1.6-34b", use_fast=False) - slow_tokenizer.add_tokens("", True) + EXPECTED_DECODED_TEXT = ['The bar plot displays the values of four categories: step, horror, mood, and lumber', 'The video shows a group of people in green shirts and white shorts performing a dance routine on'] # fmt: skip - fast_tokenizer = AutoTokenizer.from_pretrained( - "liuhaotian/perception_lm-v1.6-34b", - bos_token="<|startoftext|>", - eos_token="<|endoftext|>", - from_slow=True, - legacy=False, + self.assertEqual( + processor.batch_decode( + generate_ids_without_inputs, skip_special_tokens=True + ), + EXPECTED_DECODED_TEXT, ) - fast_tokenizer.add_tokens("", True) - - prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" - EXPECTED_OUTPUT = ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] # fmt: skip - self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) - self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) @slow @require_bitsandbytes def test_generation_no_images(self): - model_id = "facebook/Perception-LM-1B" - model = PerceptionLMForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) + # model_id = "facebook/Perception-LM-1B" + model = PerceptionLMForConditionalGeneration.from_pretrained( + TEST_MODEL_PATH, load_in_4bit=True + ) + processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH) # Prepare inputs with no images inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device) @@ -506,120 +427,3 @@ def test_generation_no_images(self): # Make sure that `generate` works _ = model.generate(**inputs, max_new_tokens=20) - @slow - @require_bitsandbytes - def test_generation_siglip_backbone(self): - model_id = "perception_lm-hf/perception_lm-interleave-qwen-0.5b-hf" - model = PerceptionLMForConditionalGeneration.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device) - processor = AutoProcessor.from_pretrained(model_id) - - # check processing with expansion of inputs (w/o expansion should work with any backbone) - processor.vision_feature_select_strategy = "default" - processor.patch_size = 14 - - image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = processor( - text="<|im_start|>user\n\nWhat are these?<|im_end|>\n<|im_start|>assistant", - images=raw_image, - return_tensors="pt", - ).to(torch_device, torch.float16) - - # Make sure that `generate` works - output = model.generate(**inputs, max_new_tokens=30) - - EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. 
The cat" - self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT) - - @slow - def test_pixtral(self): - model_id = "mistral-community/pixtral-12b" - model = PerceptionLMForConditionalGeneration.from_pretrained(model_id) - processor = AutoProcessor.from_pretrained(model_id) - - IMG_URLS = [ - Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), - ] - PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" - - # image = Image.open(requests.get(url, stream=True).raw) - inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device) - generate_ids = model.generate(**inputs, max_new_tokens=500) - ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - print(ouptut) - - # fmt: off - EXPECTED_GENERATION = """ -Describe the images. -Certainly! Here are the descriptions of the images: - -1. **Image 1**: This image features a black dog with a glossy coat sitting on a wooden surface. The dog has a calm and attentive expression, looking directly at the camera. The wooden background has a rustic appearance with visible grain and texture. - -2. **Image 2**: This image captures a breathtaking view of a mountainous landscape. The mountains are rugged and covered with patches of green vegetation. The sky above is clear, and the scene conveys a sense of tranquility and natural beauty. - -3. **Image 3**: This image shows a beach scene during sunset. The waves are gently rolling onto the shore, and several people can be seen in the water, possibly surfing or swimming. The sky is painted with warm hues of orange and yellow, creating a serene and picturesque atmosphere. - -4. **Image 4**: This image depicts a narrow, winding path that cuts through a lush, green landscape. On either side of the path, there is dense grass and various trees, including a prominent tree with white blossoms. The sky is clear and blue, adding to the peaceful and inviting ambiance of the scene. - -These descriptions provide a detailed overview of the content and atmosphere of each image. -""" - # fmt: on - # check that both inputs are handled correctly and generate the same output - self.assertEqual(ouptut, EXPECTED_GENERATION) - - @slow - @require_bitsandbytes - def test_pixtral_4bit(self): - model_id = "mistral-community/pixtral-12b" - model = PerceptionLMForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - - IMG_URLS = [ - Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), - ] - PROMPT = "[INST][IMG][IMG]Describe the images.[/INST]" - - inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(torch_device, torch.float16) - generate_ids = model.generate(**inputs, max_new_tokens=50) - output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - - EXPECTED_GENERATION = [ - # CUDA output - "Describe the images. The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. 
The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador", - # XPU output - "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which covers the entire background. The dog appears to be the main focus", - ] # fmt: skip - self.assertTrue(output in EXPECTED_GENERATION) - - @slow - @require_bitsandbytes - def test_pixtral_batched(self): - model_id = "mistral-community/pixtral-12b" - model = PerceptionLMForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id - - IMG_URLS = [ - Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/17/150/500", stream=True).raw), - ] - PROMPT = [ - "[INST][IMG]What breed is the dog?[/INST]", - "[INST][IMG]What is shown in this image?[/INST]", - ] - - inputs = processor(text=PROMPT, images=IMG_URLS, padding=True, return_tensors="pt").to( - torch_device, torch.float16 - ) - generate_ids = model.generate(**inputs, max_new_tokens=50) - output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) - - EXPECTED_GENERATION = [ - 'What breed is the dog?The dog in the image is a black Labrador Retriever.', - 'What is shown in this image?The image depicts a narrow, winding dirt path surrounded by lush greenery. The path is flanked by grass and shrubs on both sides. On the left side, there are tall trees and dense foliage, while on the right side, there' - ] # fmt: skip - self.assertEqual(output, EXPECTED_GENERATION) From f1338b145d8a87b0a7341b745087acaabfc12aca Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Thu, 12 Jun 2025 21:16:35 +0000 Subject: [PATCH 26/65] Image processing refactoring to use more common parts. 
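For context, a minimal sketch of how the refactored fast image processor is expected to be used after this change. It assumes the PerceptionLMImageProcessorFast class and the vision_input_type / tile_size / max_num_tiles kwargs introduced in this series; the kwarg values mirror the conversion script and tests, and the dummy image is illustrative rather than canonical.

# Sketch only: assumes PerceptionLMImageProcessorFast from this patch series.
import numpy as np
from PIL import Image

from transformers import PerceptionLMImageProcessorFast

# "thumb+tile" yields a thumbnail tile plus up to `max_num_tiles` tiles per image;
# rescaling and normalization now go through the shared fast-processor path.
image_processor = PerceptionLMImageProcessorFast(
    vision_input_type="thumb+tile",
    tile_size=448,
    max_num_tiles=4,
)

# Dummy non-square RGB image.
image = Image.fromarray(np.random.randint(0, 256, (450, 500, 3), dtype=np.uint8))
batch = image_processor(images=[image], return_tensors="pt")
# Expected layout is roughly (batch, num_tiles, channels, tile_size, tile_size).
print(batch["pixel_values"].shape)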
--- .../models/auto/tokenization_auto.py | 6 - .../convert_perception_lm_weights_to_hf.py | 7 +- .../image_processing_perception_lm_fast.py | 40 ++++-- .../models/perception_lm/image_transform.py | 44 +----- .../perception_lm/processing_perception_lm.py | 18 +-- .../test_processor_perception_lm.py | 129 ++++++++++++------ 6 files changed, 132 insertions(+), 112 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index ebbc3ad2c3f2..c8656a710746 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -464,12 +464,6 @@ None, ), ), - ( - "perception_lm", - ( - "PerceptionLMTokenizer", "PerceptionLMTokenizerFast" if is_tokenizers_available() else None - ) - ), ( "persimmon", ( diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index b7ee4671753a..441b8cb217eb 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -553,20 +553,19 @@ def write_tokenizer( "patch_size": params["model"]["vision_model"]["patch_size"], "processor_class": "PerceptionLMProcessor", } - image_res = params["model"]["vision_model"]["image_size"] + tile_size = params["model"]["vision_model"]["image_size"] image_preprocessor_config = { "image_processor_type": "PerceptionLMImageProcessorFast", "vision_input_type": params["data"]["vision_input_type"], - "image_res": image_res, + "tile_size": tile_size, "max_num_tiles": params["data"]["max_num_tiles"], "max_frame_tiles": 1, - "normalize_img": True, } image_preprocessor = PerceptionLMImageProcessorFast(**image_preprocessor_config) video_preprocessor_config = { "video_processor_type": "PerceptionLMVideoProcessor", - "size": {"height": image_res, "width": image_res}, + "size": {"height": tile_size, "width": tile_size}, } video_preprocessor = PerceptionLMVideoProcessor(**video_preprocessor_config) processor = PerceptionLMProcessor( diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 8eef962bf4a3..cd040581c90a 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -33,33 +33,37 @@ add_start_docstrings, is_torch_available, ) +from ...image_utils import PILImageResampling, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD if is_torch_available(): import torch class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): vision_input_type: str = "thumb+tile" - image_res: int = 448 max_num_tiles: int = 36 - normalize_img: bool = True + @add_start_docstrings( "Constructs a fast PerceptionLM image processor.", - """ - do_pad (`bool`, *optional*, defaults to `self.do_pad`): - Whether to pad the image to a square based on the longest edge. 
Can be overridden by the `do_pad` parameter - """, ) class PerceptionLMImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + do_resize = True + do_center_crop = False + do_rescale = True + do_normalize = True + do_convert_rgb = True valid_kwargs = PerceptionLMFastImageProcessorKwargs + def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None: super().__init__(**kwargs) self.image_transform = get_image_transform( vision_input_type=kwargs.get("vision_input_type", "thumb+tile"), - image_res=kwargs.get("image_res", 448), + image_res=kwargs.get("tile_size", 448), max_num_tiles=kwargs.get("max_num_tiles", 36), - normalize_img=kwargs.get("normalize_img", True), ) def to_dict(self): @@ -70,6 +74,12 @@ def to_dict(self): def _preprocess( self, images: List["torch.Tensor"], + do_resize: bool, + do_rescale: Optional[bool], + rescale_factor: Optional[Union[int, float]], + do_normalize: Optional[bool], + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], return_tensors: Optional[Union[str, TensorType]], **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs] ) -> BatchFeature: @@ -77,11 +87,23 @@ def _preprocess( del kwargs if images: grouped_images, grouped_images_index = group_images_by_shape(images) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images, _ = self.image_transform(stacked_images) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + grouped_images, grouped_images_index = group_images_by_shape(resized_images) processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): - stacked_images, _ = self.image_transform(stacked_images) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) processed_images_grouped[shape] = stacked_images processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) else: diff --git a/src/transformers/models/perception_lm/image_transform.py b/src/transformers/models/perception_lm/image_transform.py index 3ec7430cd907..e2bf0edb4d72 100644 --- a/src/transformers/models/perception_lm/image_transform.py +++ b/src/transformers/models/perception_lm/image_transform.py @@ -5,7 +5,6 @@ from logging import getLogger from typing import Any, Callable, Tuple, Union, Sequence -import numpy as np import torch import torchvision.transforms as tv from PIL import Image @@ -17,10 +16,6 @@ logger = getLogger() -MEAN = (0.5, 0.5, 0.5) -STD = (0.5, 0.5, 0.5) - - """ Resize the image to the given size. Supports both PIL images and torch.Tensor. If the image is a tensor, it's supposed to be a batch of images with shape (B, C, H, W) and dtype uint8. 
@@ -48,20 +43,17 @@ def get_image_transform( vision_input_type: str = "vanilla", image_res: int = 336, max_num_tiles: int = 1, - normalize_img: bool = True, ) -> Tuple[Callable, int]: if vision_input_type == "thumb+tile": transforms = VariableSizeImageTransform( size=image_res, max_num_tiles=max_num_tiles, - normalize_img=normalize_img, use_thumbnail="before", ) else: transforms = ImageTransform( size=image_res, - normalize_img=normalize_img, ) logger.info( @@ -73,36 +65,19 @@ def get_image_transform( class ImageTransform(object): """ - Image transform will resize the longer edge to a given size and pad the shorter edge with mean pixel value of the image. + Vanilla Image transform. """ - def __init__( self, size: int = 336, - normalize_img: bool = True, ) -> None: self.size = size - self._mean = MEAN - self._std = STD - self.normalize_img = normalize_img - logger.info(f"ImageTransform size: {self.size}") - self.to_tensor = tv.ToTensor() - self.normalize = ( - tv.Normalize( - mean=self._mean, - std=self._std, - inplace=True, - ) - if normalize_img - else lambda x: x - ) def to_dict(self): return { "size": self.size, - "normalize_img": self.normalize_img, } def __call__(self, image: Union[Image.Image, torch.Tensor]): @@ -120,7 +95,6 @@ def __call__(self, image: Union[Image.Image, torch.Tensor]): image = self.to_tensor(image) else: image = F.convert_image_dtype(image, torch.float32) - image = self.normalize(image) # Add chunk dim to make it compatible with existing dataloaders image = image.view(1, -1, 3, self.size, self.size) @@ -154,41 +128,26 @@ class VariableSizeImageTransform(object): def __init__( self, size: int = 336, - normalize_img: bool = True, max_num_tiles: int = 1, use_thumbnail: str = "no", area_limit: bool = False, ) -> None: self.size = size - self._mean = MEAN - self._std = STD logger.info(f"VariableSizeImageTransform size: {self.size}") self.to_tensor = tv.ToTensor() - self.normalize = ( - tv.Normalize( - mean=self._mean, - std=self._std, - inplace=True, - ) - if normalize_img - else lambda x: x - ) self.area_limit = area_limit self.max_num_tiles = max_num_tiles self.use_thumbnail = use_thumbnail - self.normalize_img = normalize_img if self.use_thumbnail != "no": self.thumbnail_transform = ImageTransform( size=self.size, - normalize_img=normalize_img, ) def to_dict(self): return { "size": self.size, - "normalize_img": self.normalize_img, "max_num_tiles": self.max_num_tiles, "use_thumbnail": self.use_thumbnail, } @@ -435,7 +394,6 @@ def __call__( else: image = F.convert_image_dtype(image, torch.float32) - image = self.normalize(image) image = self._split(image, ar[0], ar[1]) # type: ignore if self.use_thumbnail == "before": image = torch.cat((thumbnail, image), dim=1) diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 5b8b6b70eb04..c20a0739a963 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -20,13 +20,12 @@ from typing import List, Union, Iterable from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...image_utils import ImageInput, PILImageResampling, get_image_size, to_numpy_array, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD from ...video_utils import VideoInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order from 
...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging - logger = logging.get_logger(__name__) @@ -35,7 +34,15 @@ class PerceptionLMProcessorKwargs(ProcessingKwargs, total=False): "text_kwargs": { "padding": False, }, - "images_kwargs": {}, + "images_kwargs": { + "do_resize": True, + "do_rescale": False, + "do_normalize": True, + "size": 448, + "resample": PILImageResampling.BICUBIC, + "image_mean": IMAGENET_STANDARD_MEAN, + "image_std": IMAGENET_STANDARD_STD, + }, } @@ -83,16 +90,11 @@ def __init__( image_processor=None, tokenizer=None, patch_size=None, - vision_feature_select_strategy=None, chat_template=None, - media_token="<|image|>", - num_additional_image_tokens=0, pooling_ratio=2, **kwargs, ): self.patch_size = patch_size - self.num_additional_image_tokens = num_additional_image_tokens - self.vision_feature_select_strategy = vision_feature_select_strategy self.pooling_ratio = pooling_ratio self.image_token = tokenizer.image_token self.video_token = tokenizer.video_token diff --git a/tests/models/perception_lm/test_processor_perception_lm.py b/tests/models/perception_lm/test_processor_perception_lm.py index c9c3b47f6593..52d031fffd35 100644 --- a/tests/models/perception_lm/test_processor_perception_lm.py +++ b/tests/models/perception_lm/test_processor_perception_lm.py @@ -16,7 +16,12 @@ import tempfile import unittest -from transformers import AutoProcessor, AutoTokenizer, PerceptionLMTokenizerFast, PerceptionLMProcessor +from transformers import ( + AutoProcessor, + AutoTokenizer, + LlamaTokenizerFast, + PerceptionLMProcessor, +) from transformers.testing_utils import require_vision from transformers.utils import is_torch_available, is_vision_available @@ -24,10 +29,13 @@ if is_vision_available(): - from transformers import CLIPImageProcessor + from transformers import PerceptionLMImageProcessorFast, PerceptionLMVideoProcessor -if is_torch_available: - pass +if is_torch_available(): + import torch + + +TEST_MODEL_PATH = "/checkpoint/vision_encoder/smhu/debug/plm_hf_1b" @require_vision @@ -38,13 +46,22 @@ class PerceptionLMProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = CLIPImageProcessor(do_center_crop=False) - tokenizer = PerceptionLMTokenizerFast.from_pretrained("huggyllama/llama-7b") - tokenizer.add_special_tokens({"additional_special_tokens": [""]}) + image_processor = PerceptionLMImageProcessorFast() + video_processor = PerceptionLMVideoProcessor() + tokenizer = LlamaTokenizerFast.from_pretrained(TEST_MODEL_PATH) + tokenizer.add_special_tokens( + {"additional_special_tokens": ["<|image|>", "<|video|>"]} + ) processor_kwargs = cls.prepare_processor_dict() - processor = PerceptionLMProcessor(image_processor, tokenizer, **processor_kwargs) + processor = PerceptionLMProcessor( + image_processor=image_processor, + video_processor=video_processor, + tokenizer=tokenizer, + **processor_kwargs + ) processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.image_token + cls.image_token_id = processor.image_token_id + cls.video_token_id = processor.video_token_id def get_tokenizer(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer @@ -59,9 +76,10 @@ def tearDownClass(cls): @staticmethod def prepare_processor_dict(): return { - "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in 
message['content'] | selectattr('type', 'equalto', 'image') %}{{ '\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", - "patch_size": 128, - "vision_feature_select_strategy": "default" + "chat_template": CHAT_TEMPLATE, + "pooling_ratio": 1, + "patch_size": 14, + "processor_class": "PerceptionLMProcessor", } # fmt: skip def test_chat_template_is_saved(self): @@ -73,36 +91,63 @@ def test_chat_template_is_saved(self): # they have to be saved as separate file and loaded back from that file # so we check if the same template is loaded processor_dict = self.prepare_processor_dict() - self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None)) - - def test_can_load_various_tokenizers(self): - for checkpoint in ["Intel/perception_lm-gemma-2b", "facebook/Perception-LM-1B"]: - processor = PerceptionLMProcessor.from_pretrained(checkpoint) - tokenizer = AutoTokenizer.from_pretrained(checkpoint) - self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__) - - def test_special_mm_token_truncation(self): - """Tests that special vision tokens do not get truncated when `truncation=True` is set.""" - - processor = PerceptionLMProcessor.from_pretrained("facebook/Perception-LM-1B") - - input_str = self.prepare_text_inputs(batch_size=2, modality="image") - image_input = self.prepare_image_inputs(batch_size=2) + self.assertTrue( + processor_loaded.chat_template == processor_dict.get("chat_template", None) + ) - _ = processor( - text=input_str, - images=image_input, + def test_image_token_filling(self): + processor = self.processor_class.from_pretrained(self.tmpdirname) + processor.patch_size = 14 + processor.image_processor.tile_size = 448 + processor.image_processor.max_num_tiles = 36 + processor.image_processor.vision_input_type = "thumb+tile" + # Important to check with non square image + image = torch.randint(0, 2, (1, 3, 503, 316)) + expected_image_tokens = 1525 + image_token_index = processor.image_token_id + + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + inputs = processor( + text=[processor.apply_chat_template(messages)], + images=[image], return_tensors="pt", - truncation=None, - padding=True, ) - - with self.assertRaises(ValueError): - _ = processor( - text=input_str, - images=image_input, - return_tensors="pt", - truncation=True, - padding=True, - max_length=5, - ) + image_tokens = (inputs["input_ids"] == image_token_index).sum().item() + self.assertEqual(expected_image_tokens, image_tokens) + +CHAT_TEMPLATE = ( + "{{- bos_token }}" + "{%- if messages[0]['role'] == 'system' -%}" + " {%- set system_message = messages[0]['content']|trim %}\n" + " {%- set messages = messages[1:] %}\n" + "{%- else %}" + " {%- set system_message = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.' 
%}" + "{%- endif %}" + "{{- '<|start_header_id|>system<|end_header_id|>\\n\\n' }}" + "{{- system_message }}" + "{{- '<|eot_id|>' }}" + "{%- for message in messages %}" + "{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' }}" + "{%- for content in message['content'] | selectattr('type', 'equalto', 'image') %}" + "{{ '<|image|>' }}" + "{%- endfor %}" + "{%- for content in message['content'] | selectattr('type', 'equalto', 'video') %}" + "{{ '<|video|>' }}" + "{%- endfor %}" + "{%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}" + "{{- content['text'] | trim }}" + "{%- endfor %}" + "{{'<|eot_id|>' }}" + "{%- endfor %}" + "{%- if add_generation_prompt %}" + "{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}" + "{%- endif %}" +) \ No newline at end of file From 7a28970e7bb2df04d37b2dc7da734669f5f12ada Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Thu, 12 Jun 2025 22:48:00 +0000 Subject: [PATCH 27/65] Fix processor test. --- .../image_processing_perception_lm_fast.py | 7 ++++--- .../perception_lm/processing_perception_lm.py | 1 - .../test_processor_perception_lm.py | 20 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index cd040581c90a..c929f23e3223 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -40,6 +40,7 @@ class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): vision_input_type: str = "thumb+tile" + tile_size: int = 448 max_num_tiles: int = 36 @@ -61,9 +62,9 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast): def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None: super().__init__(**kwargs) self.image_transform = get_image_transform( - vision_input_type=kwargs.get("vision_input_type", "thumb+tile"), - image_res=kwargs.get("tile_size", 448), - max_num_tiles=kwargs.get("max_num_tiles", 36), + vision_input_type=self.vision_input_type, + image_res=self.tile_size, + max_num_tiles=self.max_num_tiles, ) def to_dict(self): diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index c20a0739a963..79734f79dafd 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -76,7 +76,6 @@ class PerceptionLMProcessor(ProcessorMixin): valid_kwargs = [ "chat_template", "patch_size", - "vision_feature_select_strategy", "image_token", "num_additional_image_tokens", ] diff --git a/tests/models/perception_lm/test_processor_perception_lm.py b/tests/models/perception_lm/test_processor_perception_lm.py index 52d031fffd35..5d108e7e3cba 100644 --- a/tests/models/perception_lm/test_processor_perception_lm.py +++ b/tests/models/perception_lm/test_processor_perception_lm.py @@ -19,7 +19,6 @@ from transformers import ( AutoProcessor, AutoTokenizer, - LlamaTokenizerFast, PerceptionLMProcessor, ) from transformers.testing_utils import require_vision @@ -46,9 +45,11 @@ class PerceptionLMProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = PerceptionLMImageProcessorFast() + image_processor = PerceptionLMImageProcessorFast( 
+ tile_size=448, max_num_tiles=4, vision_input_type="thumb+tile" + ) video_processor = PerceptionLMVideoProcessor() - tokenizer = LlamaTokenizerFast.from_pretrained(TEST_MODEL_PATH) + tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL_PATH) tokenizer.add_special_tokens( {"additional_special_tokens": ["<|image|>", "<|video|>"]} ) @@ -77,8 +78,8 @@ def tearDownClass(cls): def prepare_processor_dict(): return { "chat_template": CHAT_TEMPLATE, - "pooling_ratio": 1, "patch_size": 14, + "pooling_ratio": 2, "processor_class": "PerceptionLMProcessor", } # fmt: skip @@ -97,13 +98,11 @@ def test_chat_template_is_saved(self): def test_image_token_filling(self): processor = self.processor_class.from_pretrained(self.tmpdirname) - processor.patch_size = 14 - processor.image_processor.tile_size = 448 - processor.image_processor.max_num_tiles = 36 - processor.image_processor.vision_input_type = "thumb+tile" # Important to check with non square image - image = torch.randint(0, 2, (1, 3, 503, 316)) - expected_image_tokens = 1525 + image = torch.randn((1, 3, 450, 500)) + # 5 tiles (thumbnail tile + 4 tiles) + # 448/patch_size/pooling_ratio = 16 => 16*16 tokens per tile + expected_image_tokens = 16 * 16 * 5 image_token_index = processor.image_token_id messages = [ @@ -123,6 +122,7 @@ def test_image_token_filling(self): image_tokens = (inputs["input_ids"] == image_token_index).sum().item() self.assertEqual(expected_image_tokens, image_tokens) + CHAT_TEMPLATE = ( "{{- bos_token }}" "{%- if messages[0]['role'] == 'system' -%}" From 642d2e73e09c0de529f43051c7a713ad96e8173a Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 13 Jun 2025 04:39:12 +0000 Subject: [PATCH 28/65] update tests to use model from hub --- .../test_image_processing_perception_lm.py | 246 +++++++++--------- .../test_modeling_perception_lm.py | 4 +- .../test_processor_perception_lm.py | 3 +- .../test_video_processing_perception_lm.py | 127 +++++++++ 4 files changed, 256 insertions(+), 124 deletions(-) create mode 100644 tests/models/perception_lm/test_video_processing_perception_lm.py diff --git a/tests/models/perception_lm/test_image_processing_perception_lm.py b/tests/models/perception_lm/test_image_processing_perception_lm.py index ec5663439472..c540f1df458b 100644 --- a/tests/models/perception_lm/test_image_processing_perception_lm.py +++ b/tests/models/perception_lm/test_image_processing_perception_lm.py @@ -1,4 +1,4 @@ -# Copyright 2025 HuggingFace Inc. +# Copyright 2024 HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,29 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import unittest -from typing import Union import numpy as np +from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ChannelDimension from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torchvision_available, is_vision_available +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs +if is_torch_available(): + import torch + if is_vision_available(): from PIL import Image - from transformers import PerceptionLMImageProcessor - if is_torchvision_available(): - from torchvision.transforms import functional as F - from transformers import PerceptionLMImageProcessorFast - class PerceptionLMImageProcessingTester: def __init__( self, @@ -44,30 +41,23 @@ def __init__( image_size=18, min_resolution=30, max_resolution=400, - do_pad=True, do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, + tile_size=None, do_normalize=True, - image_mean=[0.48145466, 0.4578275, 0.40821073], - image_std=[0.26862954, 0.26130258, 0.27577711], + image_mean=IMAGENET_STANDARD_MEAN, + image_std=IMAGENET_STANDARD_STD, do_convert_rgb=True, ): super().__init__() size = size if size is not None else {"shortest_edge": 20} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} self.parent = parent self.batch_size = batch_size self.num_channels = num_channels self.image_size = image_size self.min_resolution = min_resolution self.max_resolution = max_resolution - self.do_pad = do_pad self.do_resize = do_resize - self.size = size - self.do_center_crop = do_center_crop - self.crop_size = crop_size + self.tile_size = size self.do_normalize = do_normalize self.image_mean = image_mean self.image_std = image_std @@ -75,11 +65,8 @@ def __init__( def prepare_image_processor_dict(self): return { - "do_pad": self.do_pad, "do_resize": self.do_resize, - "size": self.size, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, + "tile_size": self.size, "do_normalize": self.do_normalize, "image_mean": self.image_mean, "image_std": self.image_std, @@ -89,6 +76,7 @@ def prepare_image_processor_dict(self): def expected_output_image_shape(self, images): return self.num_channels, self.crop_size["height"], self.crop_size["width"] + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): return prepare_image_inputs( batch_size=self.batch_size, @@ -103,33 +91,30 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_torch @require_vision -# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest with CLIP->PerceptionLM class PerceptionLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = PerceptionLMImageProcessor if is_vision_available() else None fast_image_processing_class = PerceptionLMImageProcessorFast if is_torchvision_available() else None + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext def setUp(self): super().setUp() self.image_processor_tester = PerceptionLMImageProcessingTester(self) @property + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict def image_processor_dict(self): return 
self.image_processor_tester.prepare_image_processor_dict() - # Ignore copy def test_image_processor_properties(self): for image_processing_class in self.image_processor_list: image_processing = image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_pad")) self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "tile_size")) self.assertTrue(hasattr(image_processing, "do_normalize")) self.assertTrue(hasattr(image_processing, "image_mean")) self.assertTrue(hasattr(image_processing, "image_std")) self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs def test_image_processor_from_dict_with_kwargs(self): for image_processing_class in self.image_processor_list: image_processor = image_processing_class.from_dict(self.image_processor_dict) @@ -140,97 +125,120 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.size, {"shortest_edge": 42}) self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - # Ignore copy - def test_padding(self): - """ - LLaVA needs to pad images to square size before processing as per orig implementation. - Checks that image processor pads images correctly given different background colors. - """ - - # taken from original implementation: https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/perception_lm/mm_utils.py#L152 - def pad_to_square_original( - image: Image.Image, background_color: Union[int, tuple[int, int, int]] = 0 - ) -> Image.Image: - width, height = image.size - if width == height: - return image - elif width > height: - result = Image.new(image.mode, (width, width), background_color) - result.paste(image, (0, (width - height) // 2)) - return result - else: - result = Image.new(image.mode, (height, height), background_color) - result.paste(image, ((height - width) // 2, 0)) - return result + def test_call_pil(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) - for i, image_processing_class in enumerate(self.image_processor_list): - image_processor = image_processing_class.from_dict(self.image_processor_dict) - numpify = i == 0 - torchify = i == 1 - image_inputs = self.image_processor_tester.prepare_image_inputs( - equal_resolution=False, numpify=numpify, torchify=torchify - ) + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - # test with images in channel-last and channel-first format (only channel-first for torch) - for image in image_inputs: - padded_image = image_processor.pad_to_square(image) - if i == 0: - padded_image_original = pad_to_square_original(Image.fromarray(image)) - padded_image_original = np.array(padded_image_original) - - np.testing.assert_allclose(padded_image, 
padded_image_original) - - padded_image = image_processor.pad_to_square( - image.transpose(2, 0, 1), input_data_format="channels_first" - ) - padded_image = padded_image.transpose(1, 2, 0) - - np.testing.assert_allclose(padded_image, padded_image_original) - else: - padded_image_original = pad_to_square_original(F.to_pil_image(image)) - padded_image = padded_image.permute(1, 2, 0) - np.testing.assert_allclose(padded_image, padded_image_original) - - # test background color - background_color = (122, 116, 104) + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_call_numpy(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True) for image in image_inputs: - padded_image = image_processor.pad_to_square(image, background_color=background_color) - if i == 0: - padded_image_original = pad_to_square_original( - Image.fromarray(image), background_color=background_color - ) - else: - padded_image_original = pad_to_square_original( - F.to_pil_image(image), background_color=background_color - ) - padded_image = padded_image.permute(1, 2, 0) - padded_image_original = np.array(padded_image_original) - - np.testing.assert_allclose(padded_image, padded_image_original) - - background_color = 122 + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_call_pytorch(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) + for image in image_inputs: - padded_image = image_processor.pad_to_square(image, background_color=background_color) - if i == 0: - padded_image_original = pad_to_square_original( - Image.fromarray(image), background_color=background_color - ) - else: - padded_image_original = pad_to_square_original( - F.to_pil_image(image), background_color=background_color - ) - padded_image = padded_image.permute(1, 2, 0) - padded_image_original = np.array(padded_image_original) - np.testing.assert_allclose(padded_image, padded_image_original) - - # background color length should match channel length - with self.assertRaises(ValueError): - padded_image = image_processor.pad_to_square(image_inputs[0], background_color=(122, 104)) - - with self.assertRaises(ValueError): - padded_image = image_processor.pad_to_square(image_inputs[0], background_color=(122, 104, 0, 0)) - - @unittest.skip(reason="PerceptionLM does not support 4 channel images yet") - # Ignore copy + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = 
image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + @unittest.skip( + reason="LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy def test_call_numpy_4_channels(self): pass + + def test_nested_input(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + + # Test batched as a list of images + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched as a nested list of images, where each sublist is one batch + image_inputs_nested = [image_inputs[:3], image_inputs[3:]] + encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape) + + # Image processor should return same pixel values, independently of ipnut format + self.assertTrue((encoded_images_nested == encoded_images).all()) + + def test_pad_for_patching(self): + for image_processing_class in self.image_processor_list: + if image_processing_class == self.fast_image_processing_class: + numpify = False + torchify = True + input_data_format = image_processing_class.data_format + else: + numpify = True + torchify = False + input_data_format = ChannelDimension.LAST + image_processing = image_processing_class(**self.image_processor_dict) + # Create odd-sized images + image_input = self.image_processor_tester.prepare_image_inputs( + equal_resolution=True, + numpify=numpify, + torchify=torchify, + )[0] + self.assertIn(image_input.shape, [(3, 400, 400), (400, 400, 3)]) + + # Test odd-width + image_shape = (400, 601) + encoded_images = image_processing._pad_for_patching(image_input, image_shape, input_data_format) + encoded_image_shape = ( + encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:] + ) + self.assertEqual(encoded_image_shape, image_shape) + + # Test odd-height + image_shape = (503, 400) + encoded_images = image_processing._pad_for_patching(image_input, image_shape, input_data_format) + encoded_image_shape = ( + encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:] + ) + self.assertEqual(encoded_image_shape, image_shape) \ No newline at end of file diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index 8d9d2b0ef144..b9a46d82f2c7 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -306,9 +306,7 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): pass -TEST_MODEL_PATH = "/checkpoint/vision_encoder/smhu/debug/plm_hf_1b" - - +TEST_MODEL_PATH = "shumingh/plm_1b_hf" @require_torch class 
PerceptionLMForConditionalGenerationIntegrationTest(unittest.TestCase): def setUp(self): diff --git a/tests/models/perception_lm/test_processor_perception_lm.py b/tests/models/perception_lm/test_processor_perception_lm.py index 5d108e7e3cba..f03897109132 100644 --- a/tests/models/perception_lm/test_processor_perception_lm.py +++ b/tests/models/perception_lm/test_processor_perception_lm.py @@ -34,8 +34,7 @@ import torch -TEST_MODEL_PATH = "/checkpoint/vision_encoder/smhu/debug/plm_hf_1b" - +TEST_MODEL_PATH = "shumingh/plm_1b_hf" @require_vision class PerceptionLMProcessorTest(ProcessorTesterMixin, unittest.TestCase): diff --git a/tests/models/perception_lm/test_video_processing_perception_lm.py b/tests/models/perception_lm/test_video_processing_perception_lm.py new file mode 100644 index 000000000000..74d3cdb76b3a --- /dev/null +++ b/tests/models/perception_lm/test_video_processing_perception_lm.py @@ -0,0 +1,127 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs + + +if is_torch_available(): + pass + +if is_vision_available(): + if is_torchvision_available(): + from transformers import PerceptionLMVideoProcessor + + +class PerceptionLMVideoProcessingTester: + def __init__( + self, + parent, + batch_size=5, + num_frames=8, + num_channels=3, + min_resolution=30, + max_resolution=80, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=IMAGENET_STANDARD_MEAN, + image_std=IMAGENET_STANDARD_STD, + do_convert_rgb=True, + ): + size = size if size is not None else {"height": 20, "width": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_frames = num_frames + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_video_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def expected_output_video_shape(self, images): + return self.num_frames, self.num_channels, self.crop_size["height"], self.crop_size["width"] + + def prepare_video_inputs(self, equal_resolution=False, 
return_tensors="pil"): + videos = prepare_video_inputs( + batch_size=self.batch_size, + num_frames=self.num_frames, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + return_tensors=return_tensors, + ) + return videos + + +@require_torch +@require_vision +class PerceptionLMVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase): + fast_video_processing_class = PerceptionLMVideoProcessor if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.video_processor_tester = PerceptionLMVideoProcessingTester(self) + + @property + def video_processor_dict(self): + return self.video_processor_tester.prepare_video_processor_dict() + + def test_video_processor_properties(self): + video_processing = self.fast_video_processing_class(**self.video_processor_dict) + self.assertTrue(hasattr(video_processing, "do_resize")) + self.assertTrue(hasattr(video_processing, "size")) + self.assertTrue(hasattr(video_processing, "do_center_crop")) + self.assertTrue(hasattr(video_processing, "center_crop")) + self.assertTrue(hasattr(video_processing, "do_normalize")) + self.assertTrue(hasattr(video_processing, "image_mean")) + self.assertTrue(hasattr(video_processing, "image_std")) + self.assertTrue(hasattr(video_processing, "do_convert_rgb")) + + def test_video_processor_from_dict_with_kwargs(self): + video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict) + self.assertEqual(video_processor.size, {"height": 20, "width": 20}) + self.assertEqual(video_processor.crop_size, {"height": 18, "width": 18}) + + video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42, crop_size=84) + self.assertEqual(video_processor.size, {'height': 42, 'width': 42}) + self.assertEqual(video_processor.crop_size, {"height": 84, "width": 84}) From 2fadbae655c847e089ad68b3f02dfc37cd806311 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 13 Jun 2025 07:04:56 +0000 Subject: [PATCH 29/65] More test fixes. 
--- .../models/perception_lm/image_transform.py | 2 +- .../perception_lm/modular_perception_lm.py | 1 - .../test_image_processing_perception_lm.py | 92 ++++++++----------- .../test_processor_perception_lm.py | 1 - 4 files changed, 37 insertions(+), 59 deletions(-) diff --git a/src/transformers/models/perception_lm/image_transform.py b/src/transformers/models/perception_lm/image_transform.py index e2bf0edb4d72..ebaa1e15fbb9 100644 --- a/src/transformers/models/perception_lm/image_transform.py +++ b/src/transformers/models/perception_lm/image_transform.py @@ -97,7 +97,7 @@ def __call__(self, image: Union[Image.Image, torch.Tensor]): image = F.convert_image_dtype(image, torch.float32) # Add chunk dim to make it compatible with existing dataloaders - image = image.view(1, -1, 3, self.size, self.size) + image = image.unsqueeze(-4) return image, (w, h) diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index bf7ebfc84817..00c83fc37b84 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -75,7 +75,6 @@ def __init__(self, config: PerceptionEncoderConfig): ref_feat_shape=config.ref_feat_shape, embed_dim=config.width, ) - self.eva_pe._initialize_weights = lambda x: x # disable weight initialization def forward(self, x): x = self.eva_pe(x) diff --git a/tests/models/perception_lm/test_image_processing_perception_lm.py b/tests/models/perception_lm/test_image_processing_perception_lm.py index c540f1df458b..a8d609367740 100644 --- a/tests/models/perception_lm/test_image_processing_perception_lm.py +++ b/tests/models/perception_lm/test_image_processing_perception_lm.py @@ -42,14 +42,17 @@ def __init__( min_resolution=30, max_resolution=400, do_resize=True, - tile_size=None, + tile_size=16, do_normalize=True, image_mean=IMAGENET_STANDARD_MEAN, image_std=IMAGENET_STANDARD_STD, do_convert_rgb=True, + max_num_tiles=4, + vision_input_type="thumb+tile", + resample=Image.Resampling.BICUBIC, # dummy value + size = {"shortest_edge": 20}, # dummy value ): super().__init__() - size = size if size is not None else {"shortest_edge": 20} self.parent = parent self.batch_size = batch_size self.num_channels = num_channels @@ -57,20 +60,28 @@ def __init__( self.min_resolution = min_resolution self.max_resolution = max_resolution self.do_resize = do_resize - self.tile_size = size + self.tile_size = tile_size self.do_normalize = do_normalize self.image_mean = image_mean self.image_std = image_std self.do_convert_rgb = do_convert_rgb + self.max_num_tiles = max_num_tiles + self.vision_input_type = vision_input_type + self.resample = resample + self.size = size def prepare_image_processor_dict(self): return { "do_resize": self.do_resize, - "tile_size": self.size, + "tile_size": self.tile_size, "do_normalize": self.do_normalize, "image_mean": self.image_mean, "image_std": self.image_std, "do_convert_rgb": self.do_convert_rgb, + "max_num_tiles": self.max_num_tiles, + "vision_input_type": self.vision_input_type, + "resample": self.resample, + "size": self.size, } def expected_output_image_shape(self, images): @@ -93,6 +104,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_vision class PerceptionLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): fast_image_processing_class = PerceptionLMImageProcessorFast if is_torchvision_available() else None + test_slow_image_processor = False # Copied 
from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext def setUp(self): @@ -113,17 +125,20 @@ def test_image_processor_properties(self): self.assertTrue(hasattr(image_processing, "image_mean")) self.assertTrue(hasattr(image_processing, "image_std")) self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + self.assertTrue(hasattr(image_processing, "max_num_tiles")) + self.assertTrue(hasattr(image_processing, "vision_input_type")) - # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs def test_image_processor_from_dict_with_kwargs(self): for image_processing_class in self.image_processor_list: image_processor = image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 20}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + self.assertEqual(image_processor.tile_size, 16) + self.assertEqual(image_processor.max_num_tiles, 4) + self.assertEqual(image_processor.vision_input_type, "thumb+tile") - image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) - self.assertEqual(image_processor.size, {"shortest_edge": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + image_processor = image_processing_class.from_dict(self.image_processor_dict, tile_size=42, max_num_tiles=9) + self.assertEqual(image_processor.tile_size, 42) + self.assertEqual(image_processor.max_num_tiles, 9) + self.assertEqual(image_processor.vision_input_type, "thumb+tile") def test_call_pil(self): for image_processing_class in self.image_processor_list: @@ -136,12 +151,12 @@ def test_call_pil(self): # Test not batched input encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - expected_output_image_shape = (1, 1445, 3, 18, 18) + expected_output_image_shape = (1, 5, 3, 16, 16) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) # Test batched encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - expected_output_image_shape = (7, 1445, 3, 18, 18) + expected_output_image_shape = (7, 5, 3, 16, 16) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) def test_call_numpy(self): @@ -155,12 +170,12 @@ def test_call_numpy(self): # Test not batched input encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - expected_output_image_shape = (1, 1445, 3, 18, 18) + expected_output_image_shape = (1, 5, 3, 16, 16) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) # Test batched encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - expected_output_image_shape = (7, 1445, 3, 18, 18) + expected_output_image_shape = (7, 5, 3, 16, 16) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) def test_call_pytorch(self): @@ -175,17 +190,17 @@ def test_call_pytorch(self): # Test not batched input encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - expected_output_image_shape = (1, 1445, 3, 18, 18) + expected_output_image_shape = (1, 5, 3, 16, 16) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) # Test batched encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - expected_output_image_shape = (7, 1445, 3, 18, 18) + expected_output_image_shape = (7, 5, 3, 16, 16) 
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) @unittest.skip( - reason="LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" - ) # FIXME Amy + reason="PerceptionLMImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) def test_call_numpy_4_channels(self): pass @@ -196,49 +211,14 @@ def test_nested_input(self): # Test batched as a list of images encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - expected_output_image_shape = (7, 1445, 3, 18, 18) + expected_output_image_shape = (7, 5, 3, 16, 16) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) # Test batched as a nested list of images, where each sublist is one batch image_inputs_nested = [image_inputs[:3], image_inputs[3:]] encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values - expected_output_image_shape = (7, 1445, 3, 18, 18) + expected_output_image_shape = (7, 5, 3, 16, 16) self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape) # Image processor should return same pixel values, independently of ipnut format - self.assertTrue((encoded_images_nested == encoded_images).all()) - - def test_pad_for_patching(self): - for image_processing_class in self.image_processor_list: - if image_processing_class == self.fast_image_processing_class: - numpify = False - torchify = True - input_data_format = image_processing_class.data_format - else: - numpify = True - torchify = False - input_data_format = ChannelDimension.LAST - image_processing = image_processing_class(**self.image_processor_dict) - # Create odd-sized images - image_input = self.image_processor_tester.prepare_image_inputs( - equal_resolution=True, - numpify=numpify, - torchify=torchify, - )[0] - self.assertIn(image_input.shape, [(3, 400, 400), (400, 400, 3)]) - - # Test odd-width - image_shape = (400, 601) - encoded_images = image_processing._pad_for_patching(image_input, image_shape, input_data_format) - encoded_image_shape = ( - encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:] - ) - self.assertEqual(encoded_image_shape, image_shape) - - # Test odd-height - image_shape = (503, 400) - encoded_images = image_processing._pad_for_patching(image_input, image_shape, input_data_format) - encoded_image_shape = ( - encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:] - ) - self.assertEqual(encoded_image_shape, image_shape) \ No newline at end of file + self.assertTrue((encoded_images_nested == encoded_images).all()) \ No newline at end of file diff --git a/tests/models/perception_lm/test_processor_perception_lm.py b/tests/models/perception_lm/test_processor_perception_lm.py index f03897109132..df14c82d6bcc 100644 --- a/tests/models/perception_lm/test_processor_perception_lm.py +++ b/tests/models/perception_lm/test_processor_perception_lm.py @@ -79,7 +79,6 @@ def prepare_processor_dict(): "chat_template": CHAT_TEMPLATE, "patch_size": 14, "pooling_ratio": 2, - "processor_class": "PerceptionLMProcessor", } # fmt: skip def test_chat_template_is_saved(self): From 6f2d5a36f7e82aaa5bed15497042ba4276069d1b Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 13 Jun 2025 07:13:06 +0000 Subject: [PATCH 30/65] integration test GT update after rebasing; probably due to video preprocessing --- tests/models/perception_lm/test_modeling_perception_lm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index b9a46d82f2c7..7109b1234549 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -401,7 +401,7 @@ def test_small_model_integration_test_batched(self): input_length = inputs["input_ids"].shape[1] generate_ids_without_inputs = generate_ids[:, input_length:] - EXPECTED_DECODED_TEXT = ['The bar plot displays the values of four categories: step, horror, mood, and lumber', 'The video shows a group of people in green shirts and white shorts performing a dance routine on'] # fmt: skip + EXPECTED_DECODED_TEXT = ['The bar plot displays the values of four categories: step, horror, mood, and lumber', 'The video shows a group of people in green shirts and white shorts performing a jump rope routine'] # fmt: skip self.assertEqual( processor.batch_decode( From 76c9c4d159549ca8d1dee225efbd1639492a95ba Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 13 Jun 2025 21:14:16 +0000 Subject: [PATCH 31/65] update test media path to hub --- .../test_modeling_perception_lm.py | 23 +++++----- upload_script.py | 44 +++++++++++++++++++ 2 files changed, 54 insertions(+), 13 deletions(-) create mode 100644 upload_script.py diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index 7109b1234549..ff07118550ff 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -15,10 +15,7 @@ import unittest -import numpy as np -import requests -from parameterized import parameterized - +from huggingface_hub import hf_hub_download from transformers import ( AutoProcessor, AutoTokenizer, @@ -311,16 +308,16 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): class PerceptionLMForConditionalGenerationIntegrationTest(unittest.TestCase): def setUp(self): self.processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH) - # image_file = hf_hub_download( - # repo_id="raushan-testing-hf/images_test", filename="llava_v1_5_radar.jpg", repo_type="dataset" - # ) - # video_file = hf_hub_download( - # repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" - # ) - self.image_file = ( - "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/14496_0.PNG" + self.image_file = hf_hub_download( + repo_id="shumingh/perception_lm_test_images", + filename="14496_0.PNG", + repo_type="dataset", + ) + self.video_file = hf_hub_download( + repo_id="shumingh/perception_lm_test_videos", + filename="GUWR5TyiY-M_000012_000022.mp4", + repo_type="dataset", ) - self.video_file = "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4" self.conversation1 = [ { "role": "user", diff --git a/upload_script.py b/upload_script.py new file mode 100644 index 000000000000..1ccc0dd26ba6 --- /dev/null +++ b/upload_script.py @@ -0,0 +1,44 @@ +from huggingface_hub import HfApi +from pathlib import Path + +# --- Configuration --- +# Your Hugging Face username +username = "shumingh" + +# Local paths to your files +image_file_path = "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/14496_0.PNG" +video_file_path = "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4" + +# Desired repository IDs on the Hub +image_repo_id = f"{username}/perception_lm_test_images" 
+video_repo_id = f"{username}/perception_lm_test_videos" +# --- End of Configuration --- + +# Initialize the API +api = HfApi() + +# Create the dataset repositories (if they don't exist) +api.create_repo(repo_id=image_repo_id, repo_type="dataset", exist_ok=True) +api.create_repo(repo_id=video_repo_id, repo_type="dataset", exist_ok=True) + +print(f"Uploading {image_file_path} to {image_repo_id}...") +api.upload_file( + path_or_fileobj=image_file_path, + path_in_repo=Path(image_file_path).name, + repo_id=image_repo_id, + repo_type="dataset", +) +print("Image upload complete.") + +print(f"Uploading {video_file_path} to {video_repo_id}...") +api.upload_file( + path_or_fileobj=video_file_path, + path_in_repo=Path(video_file_path).name, + repo_id=video_repo_id, + repo_type="dataset", +) +print("Video upload complete.") + +print("\n--- URLs ---") +print(f"Image: https://huggingface.co/datasets/{image_repo_id}/blob/main/{Path(image_file_path).name}") +print(f"Video: https://huggingface.co/datasets/{video_repo_id}/blob/main/{Path(video_file_path).name}") \ No newline at end of file From 14c175512a2831b360989a7300c0eba64050961a Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 13 Jun 2025 21:18:46 +0000 Subject: [PATCH 32/65] Stop tracking local scripts --- test.py | 53 ------------------------------------------------ test_video.py | 43 --------------------------------------- upload_script.py | 44 ---------------------------------------- 3 files changed, 140 deletions(-) delete mode 100644 test.py delete mode 100644 test_video.py delete mode 100644 upload_script.py diff --git a/test.py b/test.py deleted file mode 100644 index dda0be6f42ee..000000000000 --- a/test.py +++ /dev/null @@ -1,53 +0,0 @@ -from transformers import AutoProcessor -from transformers import PerceptionLMForConditionalGeneration - - -MODEL_PATH = "/checkpoint/vision_encoder/smhu/debug/plm_hf_1b" -processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True) -model = PerceptionLMForConditionalGeneration.from_pretrained(MODEL_PATH).to("cuda") -conversation1 = [ - { - "role": "user", - "content": [ - { - "type": "image", - "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/14496_0.PNG", - }, - {"type": "text", "text": "Describe the bar plot in the image."}, - ], - } -] -conversation2 = [ - { - "role": "user", - "content": [ - { - "type": "video", - "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4", - }, - {"type": "text", "text": "Can you describe the video in detail?"}, - ], - } -] -# print(model.config) -inputs = processor.apply_chat_template( - [conversation1, conversation2], - num_frames=32, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - video_load_backend="decord", - padding=True, - padding_side="left", -) -inputs = inputs.to(model.device) -# torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") -generate_ids = model.generate(**inputs, max_new_tokens=256) -# Remove input_ids from generate_ids to get only the newly generated tokens -input_length = inputs["input_ids"].shape[1] -generate_ids_without_inputs = generate_ids[:, input_length:] - -# print(generate_ids_without_inputs.cpu().numpy().tolist()) -for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): - print(output) diff --git a/test_video.py b/test_video.py deleted file mode 100644 index e25ca3790e88..000000000000 --- a/test_video.py +++ /dev/null @@ -1,43 +0,0 @@ -import 
torch - -from transformers import AutoProcessor -from transformers import PerceptionLMForConditionalGeneration - -processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b") -print(type(processor)) - -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") -conversation = [ - { - "role": "user", - "content": [ - { - "type": "video", - "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4", - }, - {"type": "text", "text": "Can you describe the video in detail?"}, - ], - } -] - -# print(model.config) -inputs = processor.apply_chat_template( - conversation, - add_generation_prompt=True, - num_frames=32, - # video_fps=1, - tokenize=True, - return_dict=True, - return_tensors="pt", - video_load_backend="decord", -) -inputs = inputs.to(model.device) -# torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") -generate_ids = model.generate(**inputs, max_new_tokens=256) -# Remove input_ids from generate_ids to get only the newly generated tokens -input_length = inputs["input_ids"].shape[1] -generate_ids_without_inputs = generate_ids[:, input_length:] - -# print(generate_ids_without_inputs.cpu().numpy().tolist()) -for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): - print(output) diff --git a/upload_script.py b/upload_script.py deleted file mode 100644 index 1ccc0dd26ba6..000000000000 --- a/upload_script.py +++ /dev/null @@ -1,44 +0,0 @@ -from huggingface_hub import HfApi -from pathlib import Path - -# --- Configuration --- -# Your Hugging Face username -username = "shumingh" - -# Local paths to your files -image_file_path = "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/14496_0.PNG" -video_file_path = "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4" - -# Desired repository IDs on the Hub -image_repo_id = f"{username}/perception_lm_test_images" -video_repo_id = f"{username}/perception_lm_test_videos" -# --- End of Configuration --- - -# Initialize the API -api = HfApi() - -# Create the dataset repositories (if they don't exist) -api.create_repo(repo_id=image_repo_id, repo_type="dataset", exist_ok=True) -api.create_repo(repo_id=video_repo_id, repo_type="dataset", exist_ok=True) - -print(f"Uploading {image_file_path} to {image_repo_id}...") -api.upload_file( - path_or_fileobj=image_file_path, - path_in_repo=Path(image_file_path).name, - repo_id=image_repo_id, - repo_type="dataset", -) -print("Image upload complete.") - -print(f"Uploading {video_file_path} to {video_repo_id}...") -api.upload_file( - path_or_fileobj=video_file_path, - path_in_repo=Path(video_file_path).name, - repo_id=video_repo_id, - repo_type="dataset", -) -print("Video upload complete.") - -print("\n--- URLs ---") -print(f"Image: https://huggingface.co/datasets/{image_repo_id}/blob/main/{Path(image_file_path).name}") -print(f"Video: https://huggingface.co/datasets/{video_repo_id}/blob/main/{Path(video_file_path).name}") \ No newline at end of file From dbf35c1e55812c72476aec0ef52cb148cc44cb92 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 13 Jun 2025 22:32:01 +0000 Subject: [PATCH 33/65] address some review comments --- .../models/auto/image_processing_auto.py | 1 - .../perception_lm/configuration_perception_lm.py | 3 +-- .../convert_perception_lm_weights_to_hf.py | 1 - .../image_processing_perception_lm_fast.py | 1 - 
.../models/perception_lm/modeling_perception_lm.py | 14 +++++++------- .../models/perception_lm/modular_perception_lm.py | 12 ++++++------ .../perception_lm/processing_perception_lm.py | 7 ------- 7 files changed, 14 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index c1839598f417..a8c7a6a198e6 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -478,7 +478,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): raise initial_exception image_processor_type = config_dict.get("image_processor_type", None) - print("image_processor_type", image_processor_type) image_processor_auto_map = None if "AutoImageProcessor" in config_dict.get("auto_map", {}): image_processor_auto_map = config_dict["auto_map"]["AutoImageProcessor"] diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index bc08ba0fcacf..7578b5f9bfd5 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -159,7 +159,6 @@ def __init__( projector_pooling_ratio=1, image_token_id=128002, video_token_id=128003, - tie_word_embeddings=False, **kwargs, ): self.image_token_id = image_token_id @@ -180,7 +179,7 @@ def __init__( self.text_config = text_config self.projector_pooling_ratio = projector_pooling_ratio - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(**kwargs) __all__ = ["PerceptionLMConfig", "PerceptionEncoderConfig"] diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 441b8cb217eb..556293c479bc 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -428,7 +428,6 @@ def permute(w, n_heads, dim1=dim, dim2=dim): vision_config=vision_config.to_dict(), projector_pooling_ratio=projector_pooling_ratio, image_token_id=image_token_id, - tie_word_embeddings=tie_word_embeddings, ) config.save_pretrained(tmp_model_path) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index c929f23e3223..e5529c3c49d1 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -85,7 +85,6 @@ def _preprocess( **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs] ) -> BatchFeature: # Group images by size for batched transformation - del kwargs if images: grouped_images, grouped_images_index = group_images_by_shape(images) resized_images_grouped = {} diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 14d2690de08d..8616bd73d41b 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -52,7 +52,6 @@ def __init__(self, config: PerceptionEncoderConfig): ref_feat_shape=config.ref_feat_shape, embed_dim=config.width, ) - self.eva_pe._initialize_weights = lambda x: x # 
disable weight initialization def forward(self, x): x = self.eva_pe(x) @@ -67,18 +66,18 @@ def __init__(self, pooling_ratio=2): super(AdaptiveAvgPooling, self).__init__() self.pooling_ratio = pooling_ratio - def forward(self, x): - b, num_tokens, c = x.shape + def forward(self, hidden_states): + b, num_tokens, c = hidden_states.shape h = int(math.sqrt(num_tokens)) if h * h != num_tokens: raise ValueError(f"num_tokens {num_tokens} is expected to be a square number") shape = (h // self.pooling_ratio, h // self.pooling_ratio) - x = x.permute(0, 2, 1).reshape(b, -1, h, h) - x = F.adaptive_avg_pool2d(x, shape) - x = x.flatten(2).transpose(1, 2) + hidden_states = hidden_states.permute(0, 2, 1).reshape(b, -1, h, h) + hidden_states = F.adaptive_avg_pool2d(hidden_states, shape) + hidden_states = hidden_states.flatten(2).transpose(1, 2) - return x + return hidden_states class PerceptionLMMultiModalProjector(nn.Module): @@ -125,6 +124,7 @@ class PerceptionLMPreTrainedModel(PreTrainedModel): _supports_sdpa = True _supports_quantized_cache = True _supports_static_cache = True + _supports_flex_attn = True _supports_attention_backend = True def _init_weights(self, module): diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 00c83fc37b84..4f7c4d5eaf9e 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -89,8 +89,8 @@ def __init__(self, pooling_ratio=2): super(AdaptiveAvgPooling, self).__init__() self.pooling_ratio = pooling_ratio - def forward(self, x): - b, num_tokens, c = x.shape + def forward(self, hidden_states): + b, num_tokens, c = hidden_states.shape h = int(math.sqrt(num_tokens)) if h * h != num_tokens: raise ValueError( @@ -98,11 +98,11 @@ def forward(self, x): ) shape = (h // self.pooling_ratio, h // self.pooling_ratio) - x = x.permute(0, 2, 1).reshape(b, -1, h, h) - x = F.adaptive_avg_pool2d(x, shape) - x = x.flatten(2).transpose(1, 2) + hidden_states = hidden_states.permute(0, 2, 1).reshape(b, -1, h, h) + hidden_states = F.adaptive_avg_pool2d(hidden_states, shape) + hidden_states = hidden_states.flatten(2).transpose(1, 2) - return x + return hidden_states class PerceptionLMMultiModalProjector(nn.Module): diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 79734f79dafd..1f0122569bc0 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -73,12 +73,6 @@ class PerceptionLMProcessor(ProcessorMixin): """ attributes = ["video_processor", "image_processor", "tokenizer"] - valid_kwargs = [ - "chat_template", - "patch_size", - "image_token", - "num_additional_image_tokens", - ] image_processor_class = "AutoImageProcessor" video_processor_class = "AutoVideoProcessor" tokenizer_class = "AutoTokenizer" @@ -191,7 +185,6 @@ def _expand_media_tokens(self, sample, media_token: str, media_iter: Iterable): num_media_tokens = (height // self.patch_size // self.pooling_ratio) * ( width // self.patch_size // self.pooling_ratio ) * num_tiles - print("num_media_tokens", num_media_tokens) media_token_list.append(num_media_tokens) sample = "" for i, num_media_tokens in enumerate(media_token_list): From 15b176aac5a3b201f8421fee171423397905de41 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 05:14:02 +0000 Subject: 
[PATCH 34/65] refactor image processing. --- .../image_processing_perception_lm_fast.py | 281 ++++++++++-- .../models/perception_lm/image_transform.py | 404 ------------------ .../perception_lm/processing_perception_lm.py | 44 +- 3 files changed, 271 insertions(+), 458 deletions(-) delete mode 100644 src/transformers/models/perception_lm/image_transform.py diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index e5529c3c49d1..8d70fce54433 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -15,6 +15,8 @@ from typing import List, Optional, Tuple, Union import numpy as np +import math +from functools import reduce from transformers.models.perception_lm.image_transform import get_image_transform @@ -26,18 +28,29 @@ DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, + get_image_size, ) from ...processing_utils import Unpack from ...utils import ( TensorType, add_start_docstrings, is_torch_available, + is_torchvision_available, +) +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + PILImageResampling, ) -from ...image_utils import PILImageResampling, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD if is_torch_available(): import torch +if is_torchvision_available(): + from torchvision.transforms import functional as F + + class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): vision_input_type: str = "thumb+tile" tile_size: int = 448 @@ -58,20 +71,210 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast): do_convert_rgb = True valid_kwargs = PerceptionLMFastImageProcessorKwargs - def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None: super().__init__(**kwargs) - self.image_transform = get_image_transform( - vision_input_type=self.vision_input_type, - image_res=self.tile_size, - max_num_tiles=self.max_num_tiles, - ) def to_dict(self): dictionary = super().to_dict() dictionary["image_transform"] = self.image_transform.to_dict() return dictionary + @staticmethod + def _factors(n: int): + """Return all factors of a number.""" + return set( + reduce( + list.__add__, + ([i, n // i] for i in range(1, int(n**0.5) + 1) if n % i == 0), + ) + ) + + def _find_supported_aspect_ratios(self): + """ + This function computes all the allowed aspect ratios for a fixed + number of input chunks. + + For example, with `num_tiles=5`, it will return: + { + 0.2: [(1, 5)], + 5.0: [(5, 1)], + 0.25: [(1, 4)], + 1.0: [(2, 2), (1, 1)], + 4.0: [(4, 1)], + 0.3333333333333333: [(1, 3)], + 3.0: [(3, 1)], + 0.5: [(1, 2)], + 2.0: [(2, 1)] + } + """ + asp_dict = {} + for chunk_size in range(self.max_num_tiles, 0, -1): + _factors = sorted(self._factors(chunk_size)) + _asp_ratios = [(x, chunk_size // x) for x in _factors] + for ratio in _asp_ratios: + k = ratio[0] / ratio[1] + if k not in asp_dict: + asp_dict[k] = [ratio] + else: + asp_dict[k].append(ratio) + return asp_dict + + def _get_image_height_width( + self, image_width: int, image_height: int, target_width: int, target_height: int + ) -> Tuple[int, int]: + """ + Given image width, height and target width, height for the canvas, return the dimensions of how the image would be resized + with aspect ratio preservation. 
+ """ + scale = image_width / image_height + + if scale > 1.0: + # Width is larger than height + + # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas. + rescaling_factor = min( + target_width / image_width, target_height / image_height + ) + + # Set new width to target width and height to the rescaled height. + new_w = rescaling_factor * image_width + new_h = math.floor(new_w / scale) + + else: + # Height is larger than width + + # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas. + rescaling_factor = min( + target_width / image_width, target_height / image_height + ) + + # Set new height to target height and width to the rescaled width. + new_h = rescaling_factor * image_height + new_w = math.floor(new_h * scale) + + return new_w, new_h + + def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int): + """ + Given an image width, height and target number of chunks this function will see if the image + can be fit into any of the canvases that can be build from arranging the tiles in a grid. + If the image can be fit onto several canvases, it will return the canvas where the shorter edge + of the image will be largest. + """ + # Initialize the optimal canvas to None. If no canvas is found where image fits, function returns None. + optimal_canvas = None + optimal_image_width_height = None + + scale = img_width / img_height + + # Gather all potential supported image resolutions and iterate through them to find best match + potential_arrangements = [ + item + for sublist in self._find_supported_aspect_ratios().values() + for item in sublist + ] + for n_w, n_h in potential_arrangements: + # Compute the canvas size + canvas_width, canvas_height = n_w * tile_size, n_h * tile_size + + # Check if image can fit into the canvas without downsampling + if canvas_width >= img_width and canvas_height >= img_height: + # If we did not find a good canvas yet, we will use the current one + if optimal_canvas is None: + # Set optimal canvas and determine the actual image height and width in the canvas with aspect ratio preserving resampling + optimal_canvas = (n_w, n_h) + optimal_image_width_height = self._get_image_height_width( + image_width=img_width, + image_height=img_height, + target_width=n_w * tile_size, + target_height=n_h * tile_size, + ) + else: + # If we already found an optimal canvas before, we will check if the shorter edge of the image will be larger than the current optimal canvas. + # This means we can potentially upsample the image resolution which is beneficial to performance. + image_width_height = self._get_image_height_width( + image_width=img_width, + image_height=img_height, + target_width=n_w * tile_size, + target_height=n_h * tile_size, + ) + # Llama3V dynamic tiling. Priortize biggest canvas. + if ( + scale < 1.0 + and (image_width_height[0] >= optimal_image_width_height[0]) + ) or ( + scale >= 1.0 + and (image_width_height[1] >= optimal_image_width_height[1]) + ): + optimal_canvas = (n_w, n_h) + optimal_image_width_height = image_width_height + return optimal_canvas + + def _find_closest_aspect_ratio(self, img_width: int, img_height: int) -> Tuple: + """ + Given an image width, height and target number of chunks + this function will find the closest supported aspect ratio. 
+ """ + tgt_ar = img_width / img_height + asp_dict = self._find_supported_aspect_ratios() + cl_d, cl_p = 1e23, None + if tgt_ar >= 1: + cl_p = min( + [k for k in asp_dict.keys() if k <= tgt_ar], + key=lambda x: abs(x - tgt_ar), + ) + v = asp_dict[cl_p] + # select width + widths = [(idx, self.size * vv[0]) for idx, vv in enumerate(v)] + tgt_idx = max(widths, key=lambda x: x[1])[0] + else: + cl_p = min( + [k for k in asp_dict.keys() if k > tgt_ar], + key=lambda x: abs(1 / x - 1 / tgt_ar), + ) + v = asp_dict[cl_p] + # select height + heights = [(idx, self.size * vv[1]) for idx, vv in enumerate(v)] + tgt_idx = max(heights, key=lambda x: x[1])[0] + out = v[tgt_idx] + return out + + def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor: + # Split image into number of required tiles (width x height) + batch_size, num_channels, height, width = image.size() + image = image.view( + batch_size, num_channels, nch, height // nch, ncw, width // ncw + ) + # Permute dimensions to reorder the axes + image = image.permute(0, 2, 4, 1, 3, 5).contiguous() + # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2) + image = image.view( + batch_size, ncw * nch, num_channels, height // nch, width // ncw + ) + return image + + def resize( + self, + image: np.ndarray, + tile_size: int, + max_num_tiles: int, + resample: PILImageResampling = PILImageResampling.BICUBIC, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + h, w = get_image_size(image, channel_dim=input_data_format) + if max_num_tiles > 1: + ar = self._fit_image_to_canvas( + img_width=w, img_height=h, tile_size=tile_size + ) + if ar is None: + # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image + ar = self._find_closest_aspect_ratio(img_width=w, img_height=h) + else: + ar = (1, 1) + new_w, new_h = ar[0] * tile_size, ar[1] * tile_size + image = F.resize(image, (new_h, new_w), interpolation=resample) + return image, ar + def _preprocess( self, images: List["torch.Tensor"], @@ -85,29 +288,49 @@ def _preprocess( **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs] ) -> BatchFeature: # Group images by size for batched transformation - if images: - grouped_images, grouped_images_index = group_images_by_shape(images) - resized_images_grouped = {} - for shape, stacked_images in grouped_images.items(): - if do_resize: - stacked_images, _ = self.image_transform(stacked_images) - resized_images_grouped[shape] = stacked_images - resized_images = reorder_images(resized_images_grouped, grouped_images_index) - - grouped_images, grouped_images_index = group_images_by_shape(resized_images) - processed_images_grouped = {} - for shape, stacked_images in grouped_images.items(): - # Fused rescale and normalize - stacked_images = self.rescale_and_normalize( - stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + grouped_images, grouped_images_index = group_images_by_shape(images) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + thumbnails, _ = self.resize( + stacked_images, self.tile_size, max_num_tiles=1 ) - processed_images_grouped[shape] = stacked_images - processed_images = reorder_images(processed_images_grouped, grouped_images_index) - - processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images - return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) - else: - return 
BatchFeature(data={"pixel_values": None}, tensor_type=return_tensors) + images_for_tiling, (tiles_w, tiles_h) = self.resize( + stacked_images, self.tile_size, max_num_tiles=self.max_num_tiles + ) + image_tiles = self._split(images_for_tiling, tiles_w, tiles_h) + stacked_images = torch.cat([thumbnails.unsqueeze(1), image_tiles], dim=1) + + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images( + resized_images_grouped, grouped_images_index + ) + + grouped_images, grouped_images_index = group_images_by_shape(resized_images) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, + do_rescale, + rescale_factor, + do_normalize, + image_mean, + image_std, + ) + processed_images_grouped[shape] = stacked_images + processed_images = reorder_images( + processed_images_grouped, grouped_images_index + ) + + processed_images = ( + torch.stack(processed_images, dim=0) + if return_tensors + else processed_images + ) + return BatchFeature( + data={"pixel_values": processed_images}, tensor_type=return_tensors + ) __all__ = ["PerceptionLMImageProcessorFast"] diff --git a/src/transformers/models/perception_lm/image_transform.py b/src/transformers/models/perception_lm/image_transform.py deleted file mode 100644 index ebaa1e15fbb9..000000000000 --- a/src/transformers/models/perception_lm/image_transform.py +++ /dev/null @@ -1,404 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -import math -from functools import reduce -from logging import getLogger -from typing import Any, Callable, Tuple, Union, Sequence - -import torch -import torchvision.transforms as tv -from PIL import Image -from torchvision.transforms import functional as F -from torchvision.transforms.functional import InterpolationMode -from torchvision.transforms import ToPILImage, PILToTensor - - -logger = getLogger() - - -""" -Resize the image to the given size. Supports both PIL images and torch.Tensor. -If the image is a tensor, it's supposed to be a batch of images with shape (B, C, H, W) and dtype uint8. -If use_pil_resize is True, the images will be resized using PIL implementation of interpolation. -""" - - -def _resize( - image: Union[Image.Image, torch.Tensor], - size: Sequence[int], - use_pil_resize: bool = True, -) -> Union[Image.Image, torch.Tensor]: - if isinstance(image, torch.Tensor) and use_pil_resize: - ims = [] - for im in image: - im = ToPILImage()(im) - im = F.resize(im, size, interpolation=InterpolationMode.BICUBIC) - ims.append(PILToTensor()(im)) - return torch.stack(ims, dim=0) - else: - return F.resize(image, size, interpolation=InterpolationMode.BICUBIC) - - -def get_image_transform( - vision_input_type: str = "vanilla", - image_res: int = 336, - max_num_tiles: int = 1, -) -> Tuple[Callable, int]: - - if vision_input_type == "thumb+tile": - transforms = VariableSizeImageTransform( - size=image_res, - max_num_tiles=max_num_tiles, - use_thumbnail="before", - ) - else: - transforms = ImageTransform( - size=image_res, - ) - - logger.info( - f"Initalized transforms with: vision_input_type: '{vision_input_type}' and max_num_tiles: {max_num_tiles}." - ) - - return transforms - - -class ImageTransform(object): - """ - Vanilla Image transform. 
- """ - def __init__( - self, - size: int = 336, - ) -> None: - self.size = size - logger.info(f"ImageTransform size: {self.size}") - self.to_tensor = tv.ToTensor() - - def to_dict(self): - return { - "size": self.size, - } - - def __call__(self, image: Union[Image.Image, torch.Tensor]): - if isinstance(image, Image.Image): - w, h = image.size - else: - h, w = image.shape[-2:] - - image = _resize( - image, - (self.size, self.size), - use_pil_resize=False, - ) - if isinstance(image, Image.Image): - image = self.to_tensor(image) - else: - image = F.convert_image_dtype(image, torch.float32) - - # Add chunk dim to make it compatible with existing dataloaders - image = image.unsqueeze(-4) - return image, (w, h) - - -class VariableSizeImageTransform(object): - """ - The variable size image transform will resize the image dynamically - based on the image aspect ratio and the number of image chunks we allow. - - The algorithm will not upsample low-res images to fit a certain aspect - ratio, because that leads to a significant degradation in image quality. - - For example, if an input image is of size 300x800, and we want to allow - a maximum of 16 image chunks, it will find the closest aspect ratio that - is allowed within 16 image chunks, i.e., 2:5 = 2 horizontal patches and - 5 vertical patches, giving a total of 10 chunks. - - The image will then be resized to products of the base size (default is - 224px because MetaCLIP takes that), so in this case it will be resized to - 2*224:5*224 = 448:1120, where we maintain the original aspect ratio and - pad with the mean value for the rest. This approach minimizes the amount - of padding required for any arbitrary resolution. - - The final output will therefore be of shape (11, 3, 224, 224), where 10 - patches are coming from the resizing and chunking, and the first patch - is a downsampled version of the image that preserves aspect ratios. - """ - - def __init__( - self, - size: int = 336, - max_num_tiles: int = 1, - use_thumbnail: str = "no", - area_limit: bool = False, - ) -> None: - self.size = size - - logger.info(f"VariableSizeImageTransform size: {self.size}") - - self.to_tensor = tv.ToTensor() - self.area_limit = area_limit - self.max_num_tiles = max_num_tiles - self.use_thumbnail = use_thumbnail - if self.use_thumbnail != "no": - self.thumbnail_transform = ImageTransform( - size=self.size, - ) - - def to_dict(self): - return { - "size": self.size, - "max_num_tiles": self.max_num_tiles, - "use_thumbnail": self.use_thumbnail, - } - - @staticmethod - def _factors(n: int): - """Return all factors of a number.""" - return set( - reduce( - list.__add__, - ([i, n // i] for i in range(1, int(n**0.5) + 1) if n % i == 0), - ) - ) - - def _find_supported_aspect_ratios(self): - """ - This function computes all the allowed aspect ratios for a fixed - number of input chunks. 
- - For example, with `num_tiles=5`, it will return: - { - 0.2: [(1, 5)], - 5.0: [(5, 1)], - 0.25: [(1, 4)], - 1.0: [(2, 2), (1, 1)], - 4.0: [(4, 1)], - 0.3333333333333333: [(1, 3)], - 3.0: [(3, 1)], - 0.5: [(1, 2)], - 2.0: [(2, 1)] - } - """ - asp_dict = {} - for chunk_size in range(self.max_num_tiles, 0, -1): - _factors = sorted(VariableSizeImageTransform._factors(chunk_size)) - _asp_ratios = [(x, chunk_size // x) for x in _factors] - for ratio in _asp_ratios: - k = ratio[0] / ratio[1] - if k not in asp_dict: - asp_dict[k] = [ratio] - else: - asp_dict[k].append(ratio) - return asp_dict - - def _find_closest_aspect_ratio(self, img_width: int, img_height: int) -> Tuple: - """ - Given an image width, height and target number of chunks - this function will find the closest supported aspect ratio. - """ - tgt_ar = img_width / img_height - asp_dict = self._find_supported_aspect_ratios() - cl_d, cl_p = 1e23, None - if tgt_ar >= 1: - cl_p = min( - [k for k in asp_dict.keys() if k <= tgt_ar], - key=lambda x: abs(x - tgt_ar), - ) - v = asp_dict[cl_p] - # select width - widths = [(idx, self.size * vv[0]) for idx, vv in enumerate(v)] - tgt_idx = max(widths, key=lambda x: x[1])[0] - else: - cl_p = min( - [k for k in asp_dict.keys() if k > tgt_ar], - key=lambda x: abs(1 / x - 1 / tgt_ar), - ) - v = asp_dict[cl_p] - # select height - heights = [(idx, self.size * vv[1]) for idx, vv in enumerate(v)] - tgt_idx = max(heights, key=lambda x: x[1])[0] - out = v[tgt_idx] - return out - - def _pad( - self, image: Union[Image.Image, torch.Tensor], new_width: int, new_height: int - ) -> Union[Image.Image, torch.Tensor]: - if isinstance(image, Image.Image): - new_im = Image.new(mode="RGB", size=(new_width, new_height), color=(0, 0, 0)) # type: ignore - new_im.paste(image) - return new_im - else: - return F.pad( - image, (0, 0, new_width - image.shape[-1], new_height - image.shape[-2]) - ) - - def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor: - # Split image into number of required tiles (width x height) - batch_size, num_channels, height, width = image.size() - image = image.view( - batch_size, num_channels, nch, height // nch, ncw, width // ncw - ) - # Permute dimensions to reorder the axes - image = image.permute(0, 2, 4, 1, 3, 5).contiguous() - # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2) - image = image.view( - batch_size, ncw * nch, num_channels, height // nch, width // ncw - ) - return image - - def _get_image_height_width( - self, image_width: int, image_height: int, target_width: int, target_height: int - ) -> Tuple[int, int]: - """ - Given image width, height and target width, height for the canvas, return the dimensions of how the image would be resized - with aspect ratio preservation. - """ - scale = image_width / image_height - - if scale > 1.0: - # Width is larger than height - - # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas. - rescaling_factor = min( - target_width / image_width, target_height / image_height - ) - - # Set new width to target width and height to the rescaled height. - new_w = rescaling_factor * image_width - new_h = math.floor(new_w / scale) - - else: - # Height is larger than width - - # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas. - rescaling_factor = min( - target_width / image_width, target_height / image_height - ) - - # Set new height to target height and width to the rescaled width. 
- new_h = rescaling_factor * image_height - new_w = math.floor(new_h * scale) - - return new_w, new_h - - def _fit_image_to_canvas( - self, img_width: int, img_height: int, area_limit: bool - ) -> Any: - """ - Given an image width, height and target number of chunks this function will see if the image - can be fit into any of the canvases that can be build from arranging the tiles in a grid. - If the image can be fit onto several canvases, it will return the canvas where the shorter edge - of the image will be largest. - - If area_limit is set to True, the tie-breaking prefers the canvas where area is less than 2x the original area. - """ - # Initialize the optimal canvas to None. If no canvas is found where image fits, function returns None. - optimal_canvas = None - optimal_image_width_height = None - - scale = img_width / img_height - - # Gather all potential supported image resolutions and iterate through them to find best match - potential_arrangements = [ - item - for sublist in self._find_supported_aspect_ratios().values() - for item in sublist - ] - for n_w, n_h in potential_arrangements: - # Compute the canvas size - canvas_width, canvas_height = n_w * self.size, n_h * self.size - - # Check if image can fit into the canvas without downsampling - if canvas_width >= img_width and canvas_height >= img_height: - # If we did not find a good canvas yet, we will use the current one - if optimal_canvas is None: - # Set optimal canvas and determine the actual image height and width in the canvas with aspect ratio preserving resampling - optimal_canvas = (n_w, n_h) - optimal_image_width_height = self._get_image_height_width( - image_width=img_width, - image_height=img_height, - target_width=n_w * self.size, - target_height=n_h * self.size, - ) - else: - # If we already found an optimal canvas before, we will check if the shorter edge of the image will be larger than the current optimal canvas. - # This means we can potentially upsample the image resolution which is beneficial to performance. - image_width_height = self._get_image_height_width( - image_width=img_width, - image_height=img_height, - target_width=n_w * self.size, - target_height=n_h * self.size, - ) - if area_limit: - # Prioritize aspect ratio, and choose best within area limit when tied. - curr_scale = image_width_height[0] / image_width_height[1] - optim_scale = ( - optimal_image_width_height[0] - / optimal_image_width_height[1] - ) - if abs(scale - curr_scale) < abs(scale - optim_scale): - # 1. optimize aspect ratio - optimal_canvas = (n_w, n_h) - optimal_image_width_height = image_width_height - elif abs(scale - curr_scale) == abs(scale - optim_scale): - # 2. optimize area - if ( - image_width_height[0] * image_width_height[1] - < 2 * img_width * img_height - ): - # 2.1 area is less than 2x the original area - optimal_canvas = (n_w, n_h) - optimal_image_width_height = image_width_height - else: - # NOTE: L3V dynamid tiling. Priortize biggest canvas. 
- if ( - scale < 1.0 - and (image_width_height[0] >= optimal_image_width_height[0]) - ) or ( - scale >= 1.0 - and (image_width_height[1] >= optimal_image_width_height[1]) - ): - optimal_canvas = (n_w, n_h) - optimal_image_width_height = image_width_height - return optimal_canvas - - def __call__( - self, image: Union[Image.Image, torch.Tensor] = None - ) -> Tuple[Any, Any]: - if self.use_thumbnail != "no": - thumbnail = self.thumbnail_transform(image)[0] - - if isinstance(image, Image.Image): - w, h = image.size - else: - h, w = image.shape[-2:] - - # Check if the image can be fit to the canvas without downsampling - ar = self._fit_image_to_canvas( - img_width=w, img_height=h, area_limit=self.area_limit - ) - if ar is None: - # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image - ar = self._find_closest_aspect_ratio(img_width=w, img_height=h) - - image = _resize( - image, - (ar[1] * self.size, ar[0] * self.size), # (h, w) - use_pil_resize=True, - ) - image = self._pad(image, ar[0] * self.size, ar[1] * self.size) - - if isinstance(image, Image.Image): - image = self.to_tensor(image) - else: - image = F.convert_image_dtype(image, torch.float32) - - image = self._split(image, ar[0], ar[1]) # type: ignore - if self.use_thumbnail == "before": - image = torch.cat((thumbnail, image), dim=1) - elif self.use_thumbnail == "after": - image = torch.cat((image, thumbnail), dim=1) - elif self.use_thumbnail == "both": - image = torch.cat((thumbnail, image, thumbnail), dim=1) - return image, ar diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 1f0122569bc0..12f07814fa13 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -36,7 +36,7 @@ class PerceptionLMProcessorKwargs(ProcessingKwargs, total=False): }, "images_kwargs": { "do_resize": True, - "do_rescale": False, + "do_rescale": True, "do_normalize": True, "size": 448, "resample": PILImageResampling.BICUBIC, @@ -60,16 +60,10 @@ class PerceptionLMProcessor(ProcessorMixin): The tokenizer is a required input. patch_size (`int`, *optional*): Patch size from the vision tower. - vision_feature_select_strategy (`str`, *optional*): - The feature selection strategy used to select the vision feature from the vision backbone. - Shoudl be same as in model's config chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. - image_token (`str`, *optional*, defaults to `""`): - Special token used to denote image location. - num_additional_image_tokens (`int`, *optional*, defaults to 0): - Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other - extra tokens appended, no need to set this arg. + pooling_ratio (`int`, *optional*): + Pooling ratio for vision tokens. If not 1, we do 2D adaptive pooling over projected vision tokens. """ attributes = ["video_processor", "image_processor", "tokenizer"] @@ -104,20 +98,20 @@ def __call__( **kwargs: Unpack[PerceptionLMProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to PerceptionLMTokenizerFast's [`~PerceptionLMTokenizerFast.__call__`] if `text` is not `None` to encode - the text. 
To prepare the image(s), this method forwards the `images` and `kwrags` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring - of the above two methods for more information. + Prepares a batch containing one or more sequences of text and/or images and/or videos. + + If `text` is provided, it is tokenized using the tokenizer. + If `images` is provided, they are processed using the image processor. + If `videos` is provided, they are processed using the video processor. Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `List[str]`, `List[List[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): + The image or batch of images to be processed. Each image can be a PIL image, NumPy array, or PyTorch tensor. + Both channels-first and channels-last formats are supported. + text (`str`, `List[str]`, *optional*): + The sequence or batch of sequences to be tokenized. Each sequence can be a string. + videos (`Any`, *optional*): + The video or batch of videos to be processed. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: - `'tf'`: Return TensorFlow `tf.constant` objects. @@ -128,11 +122,11 @@ def __call__( Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: - - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is provided. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when - `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not - `None`). - - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is provided). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is provided. + - **pixel_values_videos** -- Video pixel values to be fed to a model. Returned when `videos` is provided. 
""" if images is None and text is None: raise ValueError("You have to specify at least one of `images` or `text`.") From 0337ce1ba205b988a0d3f4639c82f71368a41847 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 05:36:31 +0000 Subject: [PATCH 35/65] small fixes --- .../perception_lm/image_processing_perception_lm_fast.py | 7 ------- .../models/perception_lm/processing_perception_lm.py | 6 ++++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 8d70fce54433..72e638a764a7 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -18,8 +18,6 @@ import math from functools import reduce -from transformers.models.perception_lm.image_transform import get_image_transform - from ...image_processing_utils import ( BatchFeature, ) @@ -74,11 +72,6 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast): def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None: super().__init__(**kwargs) - def to_dict(self): - dictionary = super().to_dict() - dictionary["image_transform"] = self.image_transform.to_dict() - return dictionary - @staticmethod def _factors(n: int): """Return all factors of a number.""" diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 12f07814fa13..1756d4abf349 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -56,6 +56,8 @@ class PerceptionLMProcessor(ProcessorMixin): Args: image_processor ([`PerceptionLMImageProcessor`], *optional*): The image processor is a required input. + video_processor ([`PerceptionLMVideoProcessor`], *optional*): + The video processor is a required input. tokenizer ([`PerceptionLMTokenizerFast`], *optional*): The tokenizer is a required input. patch_size (`int`, *optional*): @@ -128,8 +130,8 @@ def __call__( - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is provided. - **pixel_values_videos** -- Video pixel values to be fed to a model. Returned when `videos` is provided. """ - if images is None and text is None: - raise ValueError("You have to specify at least one of `images` or `text`.") + if text is None: + raise ValueError("You have to specify at least `text` input. 
Optionally, you can also specify `images` or `videos`.") output_kwargs = self._merge_kwargs( PerceptionLMProcessorKwargs, From 3bc30960e412e778308e77edf5da429d2c0552a5 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 05:54:38 +0000 Subject: [PATCH 36/65] update documentation and minor fixes --- docs/source/en/model_doc/perception_lm.md | 34 ++++++------ src/transformers/models/auto/modeling_auto.py | 1 - .../perception_lm/modeling_perception_lm.py | 10 +++- .../perception_lm/modular_perception_lm.py | 12 ++++- test.py | 53 +++++++++++++++++++ test_video.py | 43 +++++++++++++++ upload_script.py | 44 +++++++++++++++ 7 files changed, 175 insertions(+), 22 deletions(-) create mode 100644 test.py create mode 100644 test_video.py create mode 100644 upload_script.py diff --git a/docs/source/en/model_doc/perception_lm.md b/docs/source/en/model_doc/perception_lm.md index b5875f9228a0..14de19791f3d 100644 --- a/docs/source/en/model_doc/perception_lm.md +++ b/docs/source/en/model_doc/perception_lm.md @@ -16,33 +16,33 @@ rendered properly in your Markdown viewer. # PerceptionLM -# PerceptionLM - -# PerceptionLM - -# PerceptionLM - -# PerceptionLM - -# PerceptionLM - -# PerceptionLM - ## Overview -The PerceptionLM model was proposed in []() by . - +The PerceptionLM model was proposed in [PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding](https://ai.meta.com/research/publications/perceptionlm-open-access-data-and-models-for-detailed-visual-understanding/) by Jang Hyun Cho et al. It's a fully open, reproducible model for transparent research in image and video understanding. PLM consists of +a vision encoder with a small scale (<8B parameters) LLM decoder. The abstract from the paper is the following: -** +*Vision-language models are integral to computer vision research, yet many high-performing models +remain closed-source, obscuring their data, design and training recipe. The research community +has responded by using distillation from black-box models to label training data, achieving strong +benchmark results, at the cost of measurable scientific progress. However, without knowing the details +of the teacher model and its data sources, scientific progress remains difficult to measure. In this +paper, we study building a Perception Language Model (PLM) in a fully open and reproducible +framework for transparent research in image and video understanding. We analyze standard training +pipelines without distillation from proprietary models and explore large-scale synthetic data to identify +critical data gaps, particularly in detailed video understanding. To bridge these gaps, we release 2.8M +human-labeled instances of fine-grained video question-answer pairs and spatio-temporally grounded +video captions. Additionally, we introduce PLM–VideoBench, a suite for evaluating challenging video +understanding tasks focusing on the ability to reason about “what”, “where”, “when”, and “how” of a +video. We make our work fully reproducible by providing data, training recipes, code & models.* Tips: -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). +This model was contributed by [shumingh](https://huggingface.co/shumingh). +The original code can be found [here](https://github.com/facebookresearch/perception_models). 
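The temporary usage scripts added (and later removed) in this series double as end-to-end examples; a minimal image-prompt sketch in the same spirit could sit alongside this overview. The checkpoint id and image URL below are illustrative placeholders, not names confirmed by this patch — substitute whatever repository the converted weights are actually pushed to.

```python
from transformers import AutoProcessor, PerceptionLMForConditionalGeneration

# Placeholder repo id; point this at the converted PerceptionLM checkpoint you want to load.
MODEL_PATH = "facebook/Perception-LM-1B"

processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
model = PerceptionLMForConditionalGeneration.from_pretrained(MODEL_PATH).to("cuda")

conversation = [
    {
        "role": "user",
        "content": [
            # Any reachable image URL (or local path) works here; this one is illustrative.
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
            {"type": "text", "text": "Describe the image."},
        ],
    }
]

inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generate_ids = model.generate(**inputs, max_new_tokens=64)
# Keep only the newly generated tokens before decoding.
new_tokens = generate_ids[:, inputs["input_ids"].shape[1]:]
print(processor.batch_decode(new_tokens, skip_special_tokens=True)[0])
```

The pattern mirrors the temporary `test.py` introduced two commits later: `apply_chat_template` handles tokenization and image preprocessing together, so no separate call to the image processor is needed.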
## PerceptionLMConfig diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 77b7564e3951..9669185e3775 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -884,7 +884,6 @@ ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), - ("perception_lm", "PerceptionLMForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 8616bd73d41b..a1ef54afa59f 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -281,9 +281,9 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - if pixel_values is not None and inputs_embeds is not None: + if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both (pixel_values or pixel_values_videos) and inputs_embeds at the same time, and must specify either one" ) if inputs_embeds is None: @@ -353,6 +353,12 @@ def set_output_embeddings(self, new_embeddings): def get_output_embeddings(self): return self.lm_head + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + def prepare_inputs_for_generation( self, input_ids, diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 4f7c4d5eaf9e..46e02f40ffa8 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -251,9 +251,11 @@ def forward( raise ValueError( "You must specify exactly one of input_ids or inputs_embeds" ) - if pixel_values is not None and inputs_embeds is not None: + if ( + pixel_values is not None or pixel_values_videos is not None + ) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both (pixel_values or pixel_values_videos) and inputs_embeds at the same time, and must specify either one" ) if inputs_embeds is None: @@ -326,6 +328,12 @@ def set_output_embeddings(self, new_embeddings): def get_output_embeddings(self): return self.lm_head + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model def prepare_inputs_for_generation( self, diff --git a/test.py b/test.py new file mode 100644 index 000000000000..c1f9bc08c328 --- /dev/null +++ b/test.py @@ -0,0 +1,53 @@ +from transformers import AutoProcessor +from transformers import PerceptionLMForConditionalGeneration + + +MODEL_PATH = "/checkpoint/vision_encoder/smhu/debug/plm_hf_1b" +processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True) +model = 
PerceptionLMForConditionalGeneration.from_pretrained(MODEL_PATH).to("cuda") +conversation1 = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/89_0.PNG", + }, + {"type": "text", "text": "Describe the bar plot in the image."}, + ], + } +] +conversation2 = [ + { + "role": "user", + "content": [ + { + "type": "video", + "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4", + }, + {"type": "text", "text": "Can you describe the video in detail?"}, + ], + } +] +# print(model.config) +inputs = processor.apply_chat_template( + [conversation1, conversation2], + num_frames=32, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + video_load_backend="decord", + padding=True, + padding_side="left", +) +inputs = inputs.to(model.device) +# torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") +generate_ids = model.generate(**inputs, max_new_tokens=256) +# Remove input_ids from generate_ids to get only the newly generated tokens +input_length = inputs["input_ids"].shape[1] +generate_ids_without_inputs = generate_ids[:, input_length:] + +# print(generate_ids_without_inputs.cpu().numpy().tolist()) +for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): + print(output) diff --git a/test_video.py b/test_video.py new file mode 100644 index 000000000000..e25ca3790e88 --- /dev/null +++ b/test_video.py @@ -0,0 +1,43 @@ +import torch + +from transformers import AutoProcessor +from transformers import PerceptionLMForConditionalGeneration + +processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b") +print(type(processor)) + +model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") +conversation = [ + { + "role": "user", + "content": [ + { + "type": "video", + "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4", + }, + {"type": "text", "text": "Can you describe the video in detail?"}, + ], + } +] + +# print(model.config) +inputs = processor.apply_chat_template( + conversation, + add_generation_prompt=True, + num_frames=32, + # video_fps=1, + tokenize=True, + return_dict=True, + return_tensors="pt", + video_load_backend="decord", +) +inputs = inputs.to(model.device) +# torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") +generate_ids = model.generate(**inputs, max_new_tokens=256) +# Remove input_ids from generate_ids to get only the newly generated tokens +input_length = inputs["input_ids"].shape[1] +generate_ids_without_inputs = generate_ids[:, input_length:] + +# print(generate_ids_without_inputs.cpu().numpy().tolist()) +for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): + print(output) diff --git a/upload_script.py b/upload_script.py new file mode 100644 index 000000000000..1ccc0dd26ba6 --- /dev/null +++ b/upload_script.py @@ -0,0 +1,44 @@ +from huggingface_hub import HfApi +from pathlib import Path + +# --- Configuration --- +# Your Hugging Face username +username = "shumingh" + +# Local paths to your files +image_file_path = "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/14496_0.PNG" +video_file_path = "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4" + +# 
Desired repository IDs on the Hub +image_repo_id = f"{username}/perception_lm_test_images" +video_repo_id = f"{username}/perception_lm_test_videos" +# --- End of Configuration --- + +# Initialize the API +api = HfApi() + +# Create the dataset repositories (if they don't exist) +api.create_repo(repo_id=image_repo_id, repo_type="dataset", exist_ok=True) +api.create_repo(repo_id=video_repo_id, repo_type="dataset", exist_ok=True) + +print(f"Uploading {image_file_path} to {image_repo_id}...") +api.upload_file( + path_or_fileobj=image_file_path, + path_in_repo=Path(image_file_path).name, + repo_id=image_repo_id, + repo_type="dataset", +) +print("Image upload complete.") + +print(f"Uploading {video_file_path} to {video_repo_id}...") +api.upload_file( + path_or_fileobj=video_file_path, + path_in_repo=Path(video_file_path).name, + repo_id=video_repo_id, + repo_type="dataset", +) +print("Video upload complete.") + +print("\n--- URLs ---") +print(f"Image: https://huggingface.co/datasets/{image_repo_id}/blob/main/{Path(image_file_path).name}") +print(f"Video: https://huggingface.co/datasets/{video_repo_id}/blob/main/{Path(video_file_path).name}") \ No newline at end of file From 53d2f01128e4ad9fefd29adc0c3e42422be2d35d Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 06:01:03 +0000 Subject: [PATCH 37/65] remove scripts --- test.py | 53 ------------------------------------------------ test_video.py | 43 --------------------------------------- upload_script.py | 44 ---------------------------------------- 3 files changed, 140 deletions(-) delete mode 100644 test.py delete mode 100644 test_video.py delete mode 100644 upload_script.py diff --git a/test.py b/test.py deleted file mode 100644 index c1f9bc08c328..000000000000 --- a/test.py +++ /dev/null @@ -1,53 +0,0 @@ -from transformers import AutoProcessor -from transformers import PerceptionLMForConditionalGeneration - - -MODEL_PATH = "/checkpoint/vision_encoder/smhu/debug/plm_hf_1b" -processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True) -model = PerceptionLMForConditionalGeneration.from_pretrained(MODEL_PATH).to("cuda") -conversation1 = [ - { - "role": "user", - "content": [ - { - "type": "image", - "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/89_0.PNG", - }, - {"type": "text", "text": "Describe the bar plot in the image."}, - ], - } -] -conversation2 = [ - { - "role": "user", - "content": [ - { - "type": "video", - "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4", - }, - {"type": "text", "text": "Can you describe the video in detail?"}, - ], - } -] -# print(model.config) -inputs = processor.apply_chat_template( - [conversation1, conversation2], - num_frames=32, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - video_load_backend="decord", - padding=True, - padding_side="left", -) -inputs = inputs.to(model.device) -# torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") -generate_ids = model.generate(**inputs, max_new_tokens=256) -# Remove input_ids from generate_ids to get only the newly generated tokens -input_length = inputs["input_ids"].shape[1] -generate_ids_without_inputs = generate_ids[:, input_length:] - -# print(generate_ids_without_inputs.cpu().numpy().tolist()) -for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): - print(output) diff --git a/test_video.py b/test_video.py deleted file mode 100644 index 
e25ca3790e88..000000000000 --- a/test_video.py +++ /dev/null @@ -1,43 +0,0 @@ -import torch - -from transformers import AutoProcessor -from transformers import PerceptionLMForConditionalGeneration - -processor = AutoProcessor.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b") -print(type(processor)) - -model = PerceptionLMForConditionalGeneration.from_pretrained("/checkpoint/vision_encoder/smhu/debug/plm_hf_1b").to(torch.bfloat16).to("cuda") -conversation = [ - { - "role": "user", - "content": [ - { - "type": "video", - "url": "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4", - }, - {"type": "text", "text": "Can you describe the video in detail?"}, - ], - } -] - -# print(model.config) -inputs = processor.apply_chat_template( - conversation, - add_generation_prompt=True, - num_frames=32, - # video_fps=1, - tokenize=True, - return_dict=True, - return_tensors="pt", - video_load_backend="decord", -) -inputs = inputs.to(model.device) -# torch.save(inputs['pixel_values'], "/checkpoint/vision_encoder/smhu/debug/0/pixel_values_dump_0.pt") -generate_ids = model.generate(**inputs, max_new_tokens=256) -# Remove input_ids from generate_ids to get only the newly generated tokens -input_length = inputs["input_ids"].shape[1] -generate_ids_without_inputs = generate_ids[:, input_length:] - -# print(generate_ids_without_inputs.cpu().numpy().tolist()) -for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): - print(output) diff --git a/upload_script.py b/upload_script.py deleted file mode 100644 index 1ccc0dd26ba6..000000000000 --- a/upload_script.py +++ /dev/null @@ -1,44 +0,0 @@ -from huggingface_hub import HfApi -from pathlib import Path - -# --- Configuration --- -# Your Hugging Face username -username = "shumingh" - -# Local paths to your files -image_file_path = "/home/smhu/code/occhi/apps/plm/dummy_datasets/image/images/14496_0.PNG" -video_file_path = "/home/smhu/code/occhi/apps/plm/dummy_datasets/video/videos/GUWR5TyiY-M_000012_000022.mp4" - -# Desired repository IDs on the Hub -image_repo_id = f"{username}/perception_lm_test_images" -video_repo_id = f"{username}/perception_lm_test_videos" -# --- End of Configuration --- - -# Initialize the API -api = HfApi() - -# Create the dataset repositories (if they don't exist) -api.create_repo(repo_id=image_repo_id, repo_type="dataset", exist_ok=True) -api.create_repo(repo_id=video_repo_id, repo_type="dataset", exist_ok=True) - -print(f"Uploading {image_file_path} to {image_repo_id}...") -api.upload_file( - path_or_fileobj=image_file_path, - path_in_repo=Path(image_file_path).name, - repo_id=image_repo_id, - repo_type="dataset", -) -print("Image upload complete.") - -print(f"Uploading {video_file_path} to {video_repo_id}...") -api.upload_file( - path_or_fileobj=video_file_path, - path_in_repo=Path(video_file_path).name, - repo_id=video_repo_id, - repo_type="dataset", -) -print("Video upload complete.") - -print("\n--- URLs ---") -print(f"Image: https://huggingface.co/datasets/{image_repo_id}/blob/main/{Path(image_file_path).name}") -print(f"Video: https://huggingface.co/datasets/{video_repo_id}/blob/main/{Path(video_file_path).name}") \ No newline at end of file From 350aa79a1f73cf1652447cde9bf3a021ee081722 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 06:14:39 +0000 Subject: [PATCH 38/65] Minor fix for CI --- .../models/perception_lm/processing_perception_lm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 1756d4abf349..bfd0422f953d 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -16,7 +16,6 @@ Processor class for PerceptionLM. """ -import torch from typing import List, Union, Iterable from ...feature_extraction_utils import BatchFeature @@ -24,7 +23,10 @@ from ...video_utils import VideoInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import logging +from ...utils import is_torch_available, logging + +if is_torch_available(): + import torch logger = logging.get_logger(__name__) From d63b4f88fe566392d599b33fdf80de61e93fe5f0 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 06:26:02 +0000 Subject: [PATCH 39/65] Fix image processing --- .../image_processing_perception_lm_fast.py | 24 ++++++++++--------- .../test_image_processing_perception_lm.py | 1 - 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 72e638a764a7..b13299edd0ec 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -203,7 +203,9 @@ def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int): optimal_image_width_height = image_width_height return optimal_canvas - def _find_closest_aspect_ratio(self, img_width: int, img_height: int) -> Tuple: + def _find_closest_aspect_ratio( + self, img_width: int, img_height: int, tile_size: int + ) -> Tuple: """ Given an image width, height and target number of chunks this function will find the closest supported aspect ratio. 
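As context for the `tile_size` parameter these hunks thread through, here is a condensed, self-contained sketch of the arrangement search performed by the fast image processor. The enumeration of supported arrangements is an assumption (all factor pairs `(w, h)` with `w * h <= max_num_tiles`; the real `_factors`-based helper is not fully shown here), and the `_fit_image_to_canvas` path that first tries to avoid downsampling is omitted.

```python
from collections import defaultdict


def find_supported_aspect_ratios(max_num_tiles: int) -> dict:
    """Group every (w, h) tile arrangement with w * h <= max_num_tiles by its aspect ratio."""
    asp_dict = defaultdict(list)
    for chunks in range(max_num_tiles, 0, -1):
        for w in (f for f in range(1, chunks + 1) if chunks % f == 0):
            asp_dict[w / (chunks // w)].append((w, chunks // w))
    return asp_dict


def closest_arrangement(img_width: int, img_height: int, max_num_tiles: int, tile_size: int):
    """Pick the tile grid whose aspect ratio is closest to the image's (a sketch of _find_closest_aspect_ratio)."""
    tgt_ar = img_width / img_height
    asp_dict = find_supported_aspect_ratios(max_num_tiles)
    # Only consider ratios on the same side of 1.0 as the image.
    candidates = [k for k in asp_dict if (k <= tgt_ar if tgt_ar >= 1 else k > tgt_ar)]
    closest = min(candidates, key=lambda k: abs(k - tgt_ar))
    # Among arrangements sharing that ratio, keep the one with the largest canvas on the long side.
    if tgt_ar >= 1:
        return max(asp_dict[closest], key=lambda wh: wh[0] * tile_size)
    return max(asp_dict[closest], key=lambda wh: wh[1] * tile_size)


print(closest_arrangement(1280, 720, max_num_tiles=4, tile_size=448))  # -> (2, 2)
```

With `(n_w, n_h)` chosen, the image is resized to `(n_w * tile_size, n_h * tile_size)` and `_split` reshapes it into `n_w * n_h` tiles, which the later hunks concatenate with the thumbnail along the tile dimension.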
@@ -218,7 +220,7 @@ def _find_closest_aspect_ratio(self, img_width: int, img_height: int) -> Tuple: ) v = asp_dict[cl_p] # select width - widths = [(idx, self.size * vv[0]) for idx, vv in enumerate(v)] + widths = [(idx, tile_size * vv[0]) for idx, vv in enumerate(v)] tgt_idx = max(widths, key=lambda x: x[1])[0] else: cl_p = min( @@ -227,7 +229,7 @@ def _find_closest_aspect_ratio(self, img_width: int, img_height: int) -> Tuple: ) v = asp_dict[cl_p] # select height - heights = [(idx, self.size * vv[1]) for idx, vv in enumerate(v)] + heights = [(idx, tile_size * vv[1]) for idx, vv in enumerate(v)] tgt_idx = max(heights, key=lambda x: x[1])[0] out = v[tgt_idx] return out @@ -261,7 +263,9 @@ def resize( ) if ar is None: # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image - ar = self._find_closest_aspect_ratio(img_width=w, img_height=h) + ar = self._find_closest_aspect_ratio( + img_width=w, img_height=h, tile_size=tile_size + ) else: ar = (1, 1) new_w, new_h = ar[0] * tile_size, ar[1] * tile_size @@ -292,12 +296,12 @@ def _preprocess( stacked_images, self.tile_size, max_num_tiles=self.max_num_tiles ) image_tiles = self._split(images_for_tiling, tiles_w, tiles_h) - stacked_images = torch.cat([thumbnails.unsqueeze(1), image_tiles], dim=1) + stacked_images = torch.cat( + [thumbnails.unsqueeze(1), image_tiles], dim=1 + ) resized_images_grouped[shape] = stacked_images - resized_images = reorder_images( - resized_images_grouped, grouped_images_index - ) + resized_images = reorder_images(resized_images_grouped, grouped_images_index) grouped_images, grouped_images_index = group_images_by_shape(resized_images) processed_images_grouped = {} @@ -317,9 +321,7 @@ def _preprocess( ) processed_images = ( - torch.stack(processed_images, dim=0) - if return_tensors - else processed_images + torch.stack(processed_images, dim=0) if return_tensors else processed_images ) return BatchFeature( data={"pixel_values": processed_images}, tensor_type=return_tensors diff --git a/tests/models/perception_lm/test_image_processing_perception_lm.py b/tests/models/perception_lm/test_image_processing_perception_lm.py index a8d609367740..1563456a9f55 100644 --- a/tests/models/perception_lm/test_image_processing_perception_lm.py +++ b/tests/models/perception_lm/test_image_processing_perception_lm.py @@ -106,7 +106,6 @@ class PerceptionLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCas fast_image_processing_class = PerceptionLMImageProcessorFast if is_torchvision_available() else None test_slow_image_processor = False - # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext def setUp(self): super().setUp() self.image_processor_tester = PerceptionLMImageProcessingTester(self) From 836f5467cd4a18fd9568a43ebe8e67e47e5dd0a2 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 06:42:29 +0000 Subject: [PATCH 40/65] CI and doc fix --- .../convert_perception_lm_weights_to_hf.py | 6 +++ .../perception_lm/modeling_perception_lm.py | 48 ++++++++++++------- .../perception_lm/modular_perception_lm.py | 48 ++++++++++++------- .../perception_lm/processing_perception_lm.py | 13 +---- 4 files changed, 72 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 556293c479bc..07e732a7a755 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ 
b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -560,6 +560,12 @@ def write_tokenizer( "tile_size": tile_size, "max_num_tiles": params["data"]["max_num_tiles"], "max_frame_tiles": 1, + "size": {"height": tile_size, "width": tile_size}, + "do_resize": True, + "do_rescale": True, + "do_normalize": True, + "image_mean": [0.5, 0.5, 0.5], + "image_std": [0.5, 0.5, 0.5], } image_preprocessor = PerceptionLMImageProcessorFast(**image_preprocessor_config) video_preprocessor_config = { diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index a1ef54afa59f..06fa3bb23b94 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -237,23 +237,39 @@ def forward( **lm_kwargs, ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - - Returns: - + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): + Pixel values for input images. + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): + Pixel values for input videos. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence token in the position embeddings. + past_key_values (`List[torch.FloatTensor]`, *optional*): + List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. + use_cache (`bool`, *optional*): + Whether or not to use past key values to speed up decoding. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ cache_position (`torch.LongTensor`, *optional*): + Position indices for cached key/value states, used for efficient generation. + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the + sequence length dimension. + lm_kwargs: + Additional keyword arguments passed to the language model. Example: - + (TODO: fix example) ```python >>> from PIL import Image >>> import requests diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 46e02f40ffa8..26e4dbe3148b 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -197,23 +197,39 @@ def forward( **lm_kwargs, ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - - Returns: - + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): + Pixel values for input images. + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): + Pixel values for input videos. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence token in the position embeddings. + past_key_values (`List[torch.FloatTensor]`, *optional*): + List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. + use_cache (`bool`, *optional*): + Whether or not to use past key values to speed up decoding. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor`, *optional*): + Position indices for cached key/value states, used for efficient generation. + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the + sequence length dimension. + lm_kwargs: + Additional keyword arguments passed to the language model. Example: - + (TODO: fix example) ```python >>> from PIL import Image >>> import requests diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index bfd0422f953d..749423cdc21d 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -19,9 +19,9 @@ from typing import List, Union, Iterable from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, PILImageResampling, get_image_size, to_numpy_array, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD +from ...image_utils import ImageInput, get_image_size, to_numpy_array from ...video_utils import VideoInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torch_available, logging @@ -36,15 +36,6 @@ class PerceptionLMProcessorKwargs(ProcessingKwargs, total=False): "text_kwargs": { "padding": False, }, - "images_kwargs": { - "do_resize": True, - "do_rescale": True, - "do_normalize": True, - "size": 448, - "resample": PILImageResampling.BICUBIC, - "image_mean": IMAGENET_STANDARD_MEAN, - "image_std": IMAGENET_STANDARD_STD, - }, } From b9e5fa010805d86426c30d95c5977a31a8e4898f Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 06:45:14 +0000 Subject: [PATCH 41/65] CI formatting fix --- tests/models/perception_lm/test_modeling_perception_lm.py | 8 -------- .../models/perception_lm/test_processor_perception_lm.py | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index ff07118550ff..8ba0ea664353 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -18,17 +18,14 @@ from huggingface_hub import hf_hub_download from transformers import ( AutoProcessor, - AutoTokenizer, PerceptionLMConfig, PerceptionLMForConditionalGeneration, is_torch_available, - is_vision_available, ) from transformers.testing_utils import ( cleanup, require_bitsandbytes, require_torch, - require_vision, slow, torch_device, ) @@ -41,11 +38,6 @@ if is_torch_available(): import torch - -if is_vision_available(): - from PIL import Image - - class PerceptionLMVisionText2TextModelTester: def __init__( self, diff --git a/tests/models/perception_lm/test_processor_perception_lm.py b/tests/models/perception_lm/test_processor_perception_lm.py index df14c82d6bcc..4d1667211596 100644 --- a/tests/models/perception_lm/test_processor_perception_lm.py +++ b/tests/models/perception_lm/test_processor_perception_lm.py @@ -148,4 +148,4 @@ def 
test_image_token_filling(self): "{%- if add_generation_prompt %}" "{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}" "{%- endif %}" -) \ No newline at end of file +) From 6b69945ca21d060ab2281a4fecfce8e3eabe220c Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 06:50:46 +0000 Subject: [PATCH 42/65] ruff fix --- .../configuration_perception_lm.py | 3 +- .../convert_perception_lm_weights_to_hf.py | 15 +++----- .../image_processing_perception_lm_fast.py | 21 ++++++----- .../perception_lm/modular_perception_lm.py | 37 +++++++------------ .../perception_lm/processing_perception_lm.py | 12 +++--- .../video_processing_perception_lm.py | 1 + .../test_image_processing_perception_lm.py | 4 +- .../test_modeling_perception_lm.py | 1 + 8 files changed, 41 insertions(+), 53 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 7578b5f9bfd5..cb4ca1fa62fe 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -13,10 +13,11 @@ # limitations under the License. """PerceptionLM model configuration""" + from transformers.configuration_utils import PretrainedConfig + from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig -from typing import Tuple logger = logging.get_logger(__name__) diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 07e732a7a755..3ad058eb1d62 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -20,6 +20,7 @@ from typing import List import torch +from timm.models.eva import checkpoint_filter_fn from tokenizers import AddedToken, processors from transformers import ( @@ -30,15 +31,12 @@ ) from transformers.convert_slow_tokenizer import TikTokenConverter from transformers.models.perception_lm.configuration_perception_lm import ( - PerceptionLMConfig, PerceptionEncoderConfig, + PerceptionLMConfig, ) from transformers.models.perception_lm.image_processing_perception_lm_fast import ( PerceptionLMImageProcessorFast, ) -from transformers.models.perception_lm.video_processing_perception_lm import ( - PerceptionLMVideoProcessor, -) from transformers.models.perception_lm.modeling_perception_lm import ( PerceptionEncoder, PerceptionLMForConditionalGeneration, @@ -46,7 +44,9 @@ from transformers.models.perception_lm.processing_perception_lm import ( PerceptionLMProcessor, ) -from timm.models.eva import checkpoint_filter_fn +from transformers.models.perception_lm.video_processing_perception_lm import ( + PerceptionLMVideoProcessor, +) try: @@ -210,7 +210,6 @@ def write_model( model_params = params.get("model", params) n_layers = model_params["n_layers"] n_heads = model_params["n_heads"] - n_heads_per_shard = n_heads // num_shards dim = model_params["dim"] dims_per_head = dim // n_heads base = model_params.get("rope_theta", 10000.0) @@ -224,11 +223,9 @@ def write_model( if model_params.get("n_kv_heads", None) is not None: num_key_value_heads = model_params["n_kv_heads"] # for GQA / MQA - num_key_value_heads_per_shard = num_key_value_heads // num_shards key_value_dim = dims_per_head * num_key_value_heads else: # compatibility with other checkpoints num_key_value_heads = n_heads - 
num_key_value_heads_per_shard = n_heads_per_shard key_value_dim = dim # permute for sliced rotary @@ -348,7 +345,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): architecture = "vit_pe_core_gigantic_patch14_448" else: raise ValueError(f"Unsupported PE config: {vision_params['layers']} layers and {vision_params['width']} width") - + vision_config = PerceptionEncoderConfig( use_cls_token=vision_params["use_cls_token"], width=vision_params["width"], diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index b13299edd0ec..31efe4879e2a 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -12,11 +12,11 @@ # limitations under the License. """Fast Image processor class for PerceptionLM.""" +import math +from functools import reduce from typing import List, Optional, Tuple, Union import numpy as np -import math -from functools import reduce from ...image_processing_utils import ( BatchFeature, @@ -24,9 +24,15 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, DefaultFastImageProcessorKwargs, + get_image_size, group_images_by_shape, reorder_images, - get_image_size, +) +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + PILImageResampling, ) from ...processing_utils import Unpack from ...utils import ( @@ -35,12 +41,7 @@ is_torch_available, is_torchvision_available, ) -from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - ChannelDimension, - PILImageResampling, -) + if is_torch_available(): import torch @@ -212,7 +213,7 @@ def _find_closest_aspect_ratio( """ tgt_ar = img_width / img_height asp_dict = self._find_supported_aspect_ratios() - cl_d, cl_p = 1e23, None + cl_p = None if tgt_ar >= 1: cl_p = min( [k for k in asp_dict.keys() if k <= tgt_ar], diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 26e4dbe3148b..53aa2f185a83 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -13,45 +13,34 @@ # limitations under the License. 
"""PyTorch PerceptionLM model.""" -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - import math +from typing import List, Optional, Tuple, Union import timm - import torch +import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -import torch.nn.functional as F from transformers.generation.utils import GenerationMixin -from ..llava.modeling_llava import ( - LlavaModel, - LlavaPreTrainedModel, - LlavaCausalLMOutputWithPast as PerceptionLMCausalLMOutputWithPast, -) -from transformers.configuration_utils import PretrainedConfig -from ..auto import CONFIG_MAPPING, AutoConfig - - -from ...activations import ACT2FN # from ...generation import GenerationMixin -from ...modeling_outputs import ModelOutput from ...modeling_utils import PreTrainedModel from ...utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, auto_docstring, + logging, ) - -from ..auto import AutoModelForCausalLM from ..auto import AutoModel - +from ..llava.modeling_llava import ( + LlavaCausalLMOutputWithPast as PerceptionLMCausalLMOutputWithPast, +) +from ..llava.modeling_llava import ( + LlavaModel, + LlavaPreTrainedModel, +) from .configuration_perception_lm import PerceptionEncoderConfig, PerceptionLMConfig + logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "PerceptionLMConfig" @@ -344,10 +333,10 @@ def set_output_embeddings(self, new_embeddings): def get_output_embeddings(self): return self.lm_head - + def set_decoder(self, decoder): self.model = decoder - + def get_decoder(self): return self.model diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 749423cdc21d..a0db9b4d73d5 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -16,17 +16,15 @@ Processor class for PerceptionLM. 
""" -from typing import List, Union, Iterable +from typing import Iterable, List, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, get_image_size, to_numpy_array -from ...video_utils import VideoInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import is_torch_available, logging +from ...utils import logging +from ...video_utils import VideoInput -if is_torch_available(): - import torch logger = logging.get_logger(__name__) @@ -151,7 +149,7 @@ def __call__( pixel_values = iter(image_inputs.get("pixel_values", [])) pixel_values_videos = iter(videos_inputs.get("pixel_values_videos", [])) - for sample in text: + for sample in text: # Replace the media token with the expanded media token sequence sample = self._expand_media_tokens(sample, self.tokenizer.image_token, pixel_values) sample = self._expand_media_tokens(sample, self.tokenizer.video_token, pixel_values_videos) @@ -161,7 +159,7 @@ def __call__( text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image", "video"]) return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors) - + def _expand_media_tokens(self, sample, media_token: str, media_iter: Iterable): media_count = sample.count(media_token) if media_count > 0: diff --git a/src/transformers/models/perception_lm/video_processing_perception_lm.py b/src/transformers/models/perception_lm/video_processing_perception_lm.py index 9e6596e1c801..7381045c1d7c 100644 --- a/src/transformers/models/perception_lm/video_processing_perception_lm.py +++ b/src/transformers/models/perception_lm/video_processing_perception_lm.py @@ -24,6 +24,7 @@ BaseVideoProcessor, ) + if is_vision_available(): from ...image_utils import PILImageResampling diff --git a/tests/models/perception_lm/test_image_processing_perception_lm.py b/tests/models/perception_lm/test_image_processing_perception_lm.py index 1563456a9f55..73e74ded3acd 100644 --- a/tests/models/perception_lm/test_image_processing_perception_lm.py +++ b/tests/models/perception_lm/test_image_processing_perception_lm.py @@ -16,7 +16,7 @@ import numpy as np -from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ChannelDimension +from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available @@ -220,4 +220,4 @@ def test_nested_input(self): self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape) # Image processor should return same pixel values, independently of ipnut format - self.assertTrue((encoded_images_nested == encoded_images).all()) \ No newline at end of file + self.assertTrue((encoded_images_nested == encoded_images).all()) diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index 8ba0ea664353..f9f7f6a96340 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -16,6 +16,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import ( AutoProcessor, PerceptionLMConfig, From 6c820123bd08a03dd9591f8e91f1b609741e6c64 Mon Sep 17 
00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 06:53:14 +0000 Subject: [PATCH 43/65] ruff formatting --- .../configuration_perception_lm.py | 4 +- .../convert_perception_lm_weights_to_hf.py | 83 +++++-------------- .../image_processing_perception_lm_fast.py | 62 ++++---------- .../perception_lm/modular_perception_lm.py | 60 ++++---------- .../perception_lm/processing_perception_lm.py | 12 ++- .../test_image_processing_perception_lm.py | 13 +-- .../test_modeling_perception_lm.py | 42 +++------- .../test_processor_perception_lm.py | 14 +--- .../test_video_processing_perception_lm.py | 2 +- 9 files changed, 86 insertions(+), 206 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index cb4ca1fa62fe..a6e39713e65e 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -13,7 +13,6 @@ # limitations under the License. """PerceptionLM model configuration""" - from transformers.configuration_utils import PretrainedConfig from ...utils import logging @@ -70,7 +69,9 @@ class PerceptionEncoderConfig(PretrainedConfig): >>> configuration = model.config ``` """ + model_type = "perception_encoder" + def __init__( self, use_cls_token=True, @@ -97,6 +98,7 @@ def __init__( self.init_values = init_values self.ref_feat_shape = ref_feat_shape + class PerceptionLMConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`PerceptionLMForConditionalGeneration`]. It is used to instantiate an diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 3ad058eb1d62..f7bf26dd1dc6 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -171,9 +171,7 @@ def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): - return multiple_of * ( - (int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of - ) + return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) def read_json(path): @@ -213,9 +211,7 @@ def write_model( dim = model_params["dim"] dims_per_head = dim // n_heads base = model_params.get("rope_theta", 10000.0) - inv_freq = 1.0 / ( - base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head) - ) + inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) context_length = model_params["max_seqlen"] max_position_embeddings = context_length tie_word_embeddings = model_params.get("weight_tying", False) @@ -230,11 +226,7 @@ def write_model( # permute for sliced rotary def permute(w, n_heads, dim1=dim, dim2=dim): - return ( - w.view(n_heads, dim1 // n_heads // 2, 2, dim2) - .transpose(1, 2) - .reshape(dim1, dim2) - ) + return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) with tempfile.TemporaryDirectory() as tmp_model_path: print(f"Fetching all parameters from the checkpoint at {input_base_path}.") @@ -249,9 +241,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): ) else: # Sharded - checkpoint_list = sorted( - [file for file in os.listdir(input_base_path) if file.endswith(".pth")] - ) + checkpoint_list = sorted([file for file in os.listdir(input_base_path) if file.endswith(".pth")]) 
print("Loading in order:", checkpoint_list) loaded = [ torch.load( @@ -297,9 +287,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): f"layers.{layer_i}.ffn_norm.weight" ], } - state_dict[ - f"model.language_model.layers.{layer_i}.self_attn.rotary_emb.inv_freq" - ] = inv_freq + state_dict[f"model.language_model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() @@ -311,18 +299,10 @@ def permute(w, n_heads, dim1=dim, dim2=dim): state_dict = { "model.language_model.embed_tokens.weight": loaded["tok_embeddings.weight"], "model.language_model.norm.weight": loaded["norm.weight"], - "model.multi_modal_projector.projector.0.weight": loaded[ - "vision_projector.projector.0.weight" - ], - "model.multi_modal_projector.projector.2.weight": loaded[ - "vision_projector.projector.2.weight" - ], - "model.multi_modal_projector.projector.0.bias": loaded[ - "vision_projector.projector.0.bias" - ], - "model.multi_modal_projector.projector.2.bias": loaded[ - "vision_projector.projector.2.bias" - ], + "model.multi_modal_projector.projector.0.weight": loaded["vision_projector.projector.0.weight"], + "model.multi_modal_projector.projector.2.weight": loaded["vision_projector.projector.2.weight"], + "model.multi_modal_projector.projector.0.bias": loaded["vision_projector.projector.0.bias"], + "model.multi_modal_projector.projector.2.bias": loaded["vision_projector.projector.2.bias"], } if not tie_word_embeddings: state_dict["lm_head.weight"] = loaded["output.weight"] @@ -333,18 +313,16 @@ def permute(w, n_heads, dim1=dim, dim2=dim): print(f"Saved {filename}") filename = f"pytorch_model-{n_layers + 2}-of-{n_layers + 2}.bin" - state_dict = { - k.replace("vision_model.", ""): v - for k, v in loaded.items() - if "vision_model" in k - } + state_dict = {k.replace("vision_model.", ""): v for k, v in loaded.items() if "vision_model" in k} vision_params = model_params["vision_model"] if vision_params["layers"] == 23 and vision_params["width"] == 1024: architecture = "vit_pe_core_large_patch14_336" elif vision_params["layers"] == 47 and vision_params["width"] == 1536: architecture = "vit_pe_core_gigantic_patch14_448" else: - raise ValueError(f"Unsupported PE config: {vision_params['layers']} layers and {vision_params['width']} width") + raise ValueError( + f"Unsupported PE config: {vision_params['layers']} layers and {vision_params['width']} width" + ) vision_config = PerceptionEncoderConfig( use_cls_token=vision_params["use_cls_token"], @@ -372,23 +350,12 @@ def permute(w, n_heads, dim1=dim, dim2=dim): # Write configs index_dict["metadata"] = {"total_size": param_count * 2} - write_json( - index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json") - ) - ffn_dim_multiplier = ( - model_params["ffn_dim_multiplier"] - if "ffn_dim_multiplier" in model_params - else 1 - ) - multiple_of = ( - model_params["multiple_of"] if "multiple_of" in model_params else 256 - ) + write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) + ffn_dim_multiplier = model_params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in model_params else 1 + multiple_of = model_params["multiple_of"] if "multiple_of" in model_params else 256 bos_token_id = tokenizer.convert_tokens_to_ids("<|begin_of_text|>") - eos_token_id = [ - tokenizer.convert_tokens_to_ids(t) - for t in ["<|end_of_text|>", "<|eot_id|>"] - ] + eos_token_id = [tokenizer.convert_tokens_to_ids(t) for t in ["<|end_of_text|>", "<|eot_id|>"]] 
use_scaled_rope = model_params["use_scaled_rope"] if use_scaled_rope: @@ -404,9 +371,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): text_config = LlamaConfig( hidden_size=dim, - intermediate_size=compute_intermediate_size( - dim, ffn_dim_multiplier, multiple_of - ), + intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), num_attention_heads=model_params["n_heads"], num_hidden_layers=model_params["n_layers"], rms_norm_eps=model_params["norm_eps"], @@ -531,9 +496,7 @@ def write_tokenizer( push_to_hub=False, ): print("Converting the tokenizer.") - tokenizer_class = ( - LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast - ) + tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast context_length = params["model"]["max_seqlen"] tokenizer = Llama3Converter( input_tokenizer_path, @@ -541,9 +504,7 @@ def write_tokenizer( context_length, ).converted_tokenizer - tokenizer.image_token_id = tokenizer.encode( - tokenizer.image_token, add_special_tokens=False - )[0] + tokenizer.image_token_id = tokenizer.encode(tokenizer.image_token, add_special_tokens=False)[0] processor_config = { "pooling_ratio": params["model"]["pooling_ratio"], "patch_size": params["model"]["vision_model"]["patch_size"], @@ -579,9 +540,7 @@ def write_tokenizer( ) if push_to_hub: - print( - f"Pushing a {tokenizer_class.__name__} to the Hub repo - {tokenizer_path}." - ) + print(f"Pushing a {tokenizer_class.__name__} to the Hub repo - {tokenizer_path}.") processor.push_to_hub(tokenizer_path, private=True, use_temp_dir=True) else: print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 31efe4879e2a..996105f26d92 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -126,9 +126,7 @@ def _get_image_height_width( # Width is larger than height # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas. - rescaling_factor = min( - target_width / image_width, target_height / image_height - ) + rescaling_factor = min(target_width / image_width, target_height / image_height) # Set new width to target width and height to the rescaled height. new_w = rescaling_factor * image_width @@ -138,9 +136,7 @@ def _get_image_height_width( # Height is larger than width # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas. - rescaling_factor = min( - target_width / image_width, target_height / image_height - ) + rescaling_factor = min(target_width / image_width, target_height / image_height) # Set new height to target height and width to the rescaled width. 
new_h = rescaling_factor * image_height @@ -163,9 +159,7 @@ def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int): # Gather all potential supported image resolutions and iterate through them to find best match potential_arrangements = [ - item - for sublist in self._find_supported_aspect_ratios().values() - for item in sublist + item for sublist in self._find_supported_aspect_ratios().values() for item in sublist ] for n_w, n_h in potential_arrangements: # Compute the canvas size @@ -193,20 +187,14 @@ def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int): target_height=n_h * tile_size, ) # Llama3V dynamic tiling. Priortize biggest canvas. - if ( - scale < 1.0 - and (image_width_height[0] >= optimal_image_width_height[0]) - ) or ( - scale >= 1.0 - and (image_width_height[1] >= optimal_image_width_height[1]) + if (scale < 1.0 and (image_width_height[0] >= optimal_image_width_height[0])) or ( + scale >= 1.0 and (image_width_height[1] >= optimal_image_width_height[1]) ): optimal_canvas = (n_w, n_h) optimal_image_width_height = image_width_height return optimal_canvas - def _find_closest_aspect_ratio( - self, img_width: int, img_height: int, tile_size: int - ) -> Tuple: + def _find_closest_aspect_ratio(self, img_width: int, img_height: int, tile_size: int) -> Tuple: """ Given an image width, height and target number of chunks this function will find the closest supported aspect ratio. @@ -238,15 +226,11 @@ def _find_closest_aspect_ratio( def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor: # Split image into number of required tiles (width x height) batch_size, num_channels, height, width = image.size() - image = image.view( - batch_size, num_channels, nch, height // nch, ncw, width // ncw - ) + image = image.view(batch_size, num_channels, nch, height // nch, ncw, width // ncw) # Permute dimensions to reorder the axes image = image.permute(0, 2, 4, 1, 3, 5).contiguous() # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2) - image = image.view( - batch_size, ncw * nch, num_channels, height // nch, width // ncw - ) + image = image.view(batch_size, ncw * nch, num_channels, height // nch, width // ncw) return image def resize( @@ -259,14 +243,10 @@ def resize( ): h, w = get_image_size(image, channel_dim=input_data_format) if max_num_tiles > 1: - ar = self._fit_image_to_canvas( - img_width=w, img_height=h, tile_size=tile_size - ) + ar = self._fit_image_to_canvas(img_width=w, img_height=h, tile_size=tile_size) if ar is None: # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image - ar = self._find_closest_aspect_ratio( - img_width=w, img_height=h, tile_size=tile_size - ) + ar = self._find_closest_aspect_ratio(img_width=w, img_height=h, tile_size=tile_size) else: ar = (1, 1) new_w, new_h = ar[0] * tile_size, ar[1] * tile_size @@ -283,23 +263,19 @@ def _preprocess( image_mean: Optional[Union[float, list[float]]], image_std: Optional[Union[float, list[float]]], return_tensors: Optional[Union[str, TensorType]], - **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs] + **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs], ) -> BatchFeature: # Group images by size for batched transformation grouped_images, grouped_images_index = group_images_by_shape(images) resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: - thumbnails, _ = self.resize( - stacked_images, self.tile_size, max_num_tiles=1 - ) + thumbnails, _ = 
self.resize(stacked_images, self.tile_size, max_num_tiles=1) images_for_tiling, (tiles_w, tiles_h) = self.resize( stacked_images, self.tile_size, max_num_tiles=self.max_num_tiles ) image_tiles = self._split(images_for_tiling, tiles_w, tiles_h) - stacked_images = torch.cat( - [thumbnails.unsqueeze(1), image_tiles], dim=1 - ) + stacked_images = torch.cat([thumbnails.unsqueeze(1), image_tiles], dim=1) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) @@ -317,16 +293,10 @@ def _preprocess( image_std, ) processed_images_grouped[shape] = stacked_images - processed_images = reorder_images( - processed_images_grouped, grouped_images_index - ) + processed_images = reorder_images(processed_images_grouped, grouped_images_index) - processed_images = ( - torch.stack(processed_images, dim=0) if return_tensors else processed_images - ) - return BatchFeature( - data={"pixel_values": processed_images}, tensor_type=return_tensors - ) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) __all__ = ["PerceptionLMImageProcessorFast"] diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 53aa2f185a83..e98a773acb61 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -82,9 +82,7 @@ def forward(self, hidden_states): b, num_tokens, c = hidden_states.shape h = int(math.sqrt(num_tokens)) if h * h != num_tokens: - raise ValueError( - f"num_tokens {num_tokens} is expected to be a square number" - ) + raise ValueError(f"num_tokens {num_tokens} is expected to be a square number") shape = (h // self.pooling_ratio, h // self.pooling_ratio) hidden_states = hidden_states.permute(0, 2, 1).reshape(b, -1, h, h) @@ -115,9 +113,7 @@ def __init__(self, config: PerceptionLMConfig): ] ) self.pooling = ( - AdaptiveAvgPooling(config.projector_pooling_ratio) - if config.projector_pooling_ratio > 1 - else nn.Identity() + AdaptiveAvgPooling(config.projector_pooling_ratio) if config.projector_pooling_ratio > 1 else nn.Identity() ) def forward(self, features): @@ -239,26 +235,14 @@ def forward( "USER: \nWhat's the content of the image? 
ASSISTANT: The image features a busy city street with a stop sign prominently displayed" ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You must specify exactly one of input_ids or inputs_embeds" - ) - if ( - pixel_values is not None or pixel_values_videos is not None - ) and inputs_embeds is not None: + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( "You cannot specify both (pixel_values or pixel_values_videos) and inputs_embeds at the same time, and must specify either one" ) @@ -273,13 +257,9 @@ def forward( ) special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) self.check_mask_feature_size_match(special_image_mask, image_features) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to( - inputs_embeds.device - ) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) image_features = image_features.to(inputs_embeds) - inputs_embeds = inputs_embeds.masked_scatter( - special_image_mask, image_features - ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) if pixel_values_videos is not None: video_features = self.get_image_features( @@ -287,13 +267,9 @@ def forward( ) special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1) self.check_mask_feature_size_match(special_video_mask, video_features) - special_video_mask = special_video_mask.expand_as(inputs_embeds).to( - inputs_embeds.device - ) + special_video_mask = special_video_mask.expand_as(inputs_embeds).to(inputs_embeds.device) video_features = video_features.to(inputs_embeds) - inputs_embeds = inputs_embeds.masked_scatter( - special_video_mask, video_features - ) + inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features) outputs = self.language_model( attention_mask=attention_mask, @@ -312,17 +288,13 @@ def forward( @auto_docstring -class PerceptionLMForConditionalGeneration( - PerceptionLMPreTrainedModel, GenerationMixin -): +class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: PerceptionLMConfig, **super_kwargs): super().__init__(config, **super_kwargs) self.model = PerceptionLMModel(config) - self.lm_head = nn.Linear( - config.text_config.hidden_size, config.text_config.vocab_size, bias=False - ) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) self.post_init() def get_input_embeddings(self): @@ -437,11 +409,7 @@ def forward( hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = ( - slice(-logits_to_keep, None) - if 
isinstance(logits_to_keep, int) - else logits_to_keep - ) + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep logits = self.lm_head(hidden_states[:, slice_indices, :]) loss = None diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index a0db9b4d73d5..c52995bde05d 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -122,7 +122,9 @@ def __call__( - **pixel_values_videos** -- Video pixel values to be fed to a model. Returned when `videos` is provided. """ if text is None: - raise ValueError("You have to specify at least `text` input. Optionally, you can also specify `images` or `videos`.") + raise ValueError( + "You have to specify at least `text` input. Optionally, you can also specify `images` or `videos`." + ) output_kwargs = self._merge_kwargs( PerceptionLMProcessorKwargs, @@ -169,9 +171,11 @@ def _expand_media_tokens(self, sample, media_token: str, media_iter: Iterable): for media in media_list: height, width = get_image_size(to_numpy_array(media)) num_tiles = media.shape[0] - num_media_tokens = (height // self.patch_size // self.pooling_ratio) * ( - width // self.patch_size // self.pooling_ratio - ) * num_tiles + num_media_tokens = ( + (height // self.patch_size // self.pooling_ratio) + * (width // self.patch_size // self.pooling_ratio) + * num_tiles + ) media_token_list.append(num_media_tokens) sample = "" for i, num_media_tokens in enumerate(media_token_list): diff --git a/tests/models/perception_lm/test_image_processing_perception_lm.py b/tests/models/perception_lm/test_image_processing_perception_lm.py index 73e74ded3acd..8d6d95e89dc0 100644 --- a/tests/models/perception_lm/test_image_processing_perception_lm.py +++ b/tests/models/perception_lm/test_image_processing_perception_lm.py @@ -32,6 +32,7 @@ if is_torchvision_available(): from transformers import PerceptionLMImageProcessorFast + class PerceptionLMImageProcessingTester: def __init__( self, @@ -49,8 +50,8 @@ def __init__( do_convert_rgb=True, max_num_tiles=4, vision_input_type="thumb+tile", - resample=Image.Resampling.BICUBIC, # dummy value - size = {"shortest_edge": 20}, # dummy value + resample=Image.Resampling.BICUBIC, # dummy value + size={"shortest_edge": 20}, # dummy value ): super().__init__() self.parent = parent @@ -134,7 +135,9 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.max_num_tiles, 4) self.assertEqual(image_processor.vision_input_type, "thumb+tile") - image_processor = image_processing_class.from_dict(self.image_processor_dict, tile_size=42, max_num_tiles=9) + image_processor = image_processing_class.from_dict( + self.image_processor_dict, tile_size=42, max_num_tiles=9 + ) self.assertEqual(image_processor.tile_size, 42) self.assertEqual(image_processor.max_num_tiles, 9) self.assertEqual(image_processor.vision_input_type, "thumb+tile") @@ -197,9 +200,7 @@ def test_call_pytorch(self): expected_output_image_shape = (7, 5, 3, 16, 16) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - @unittest.skip( - reason="PerceptionLMImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" - ) + @unittest.skip(reason="PerceptionLMImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") def test_call_numpy_4_channels(self): pass diff --git 
a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index f9f7f6a96340..256380dbf510 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -39,6 +39,7 @@ if is_torch_available(): import torch + class PerceptionLMVisionText2TextModelTester: def __init__( self, @@ -103,10 +104,8 @@ def __init__( self.num_tiles = 1 self.num_frames = 1 self.num_channels = 3 - self.image_size = self.vision_config["img_size"][0] - self.num_image_tokens = ( - self.vision_config["img_size"][0] // 14 - ) ** 2 + self.image_size = self.vision_config["img_size"][0] + self.num_image_tokens = (self.vision_config["img_size"][0] // 14) ** 2 self.seq_length = seq_length + self.num_image_tokens self.encoder_seq_length = self.seq_length @@ -144,12 +143,7 @@ def prepare_config_and_inputs(self): def prepare_config_and_inputs_for_common(self): config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs() - input_ids = ( - ids_tensor( - [self.batch_size, self.seq_length], config.text_config.vocab_size - 2 - ) - + 2 - ) + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) input_ids[input_ids == config.image_token_id] = self.pad_token_id input_ids[input_ids == config.video_token_id] = self.pad_token_id @@ -168,16 +162,12 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class PerceptionLMForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, unittest.TestCase -): +class PerceptionLMForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): """ Model tester for `PerceptionLMForConditionalGeneration`. 
""" - all_model_classes = ( - (PerceptionLMForConditionalGeneration,) if is_torch_available() else () - ) + all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () test_pruning = False test_head_masking = False _is_composite = True @@ -270,7 +260,6 @@ def test_mismatching_num_image_tokens(self): pixel_values = torch.cat([pixel_values, pixel_values], dim=0) _ = model(input_ids=input_ids, pixel_values=pixel_values) - @unittest.skip( reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) @@ -297,6 +286,8 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): TEST_MODEL_PATH = "shumingh/plm_1b_hf" + + @require_torch class PerceptionLMForConditionalGenerationIntegrationTest(unittest.TestCase): def setUp(self): @@ -362,18 +353,14 @@ def test_small_model_integration_test(self): EXPECTED_DECODED_TEXT = "The bar plot displays the values of four categories: step, horror, mood, and lumber" # fmt: skip self.assertEqual( - self.processor.decode( - generate_ids_without_inputs[0], skip_special_tokens=True - ), + self.processor.decode(generate_ids_without_inputs[0], skip_special_tokens=True), EXPECTED_DECODED_TEXT, ) @slow @require_bitsandbytes def test_small_model_integration_test_batched(self): - model = PerceptionLMForConditionalGeneration.from_pretrained( - TEST_MODEL_PATH, load_in_4bit=True - ) + model = PerceptionLMForConditionalGeneration.from_pretrained(TEST_MODEL_PATH, load_in_4bit=True) processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH) inputs = processor.apply_chat_template( [self.conversation1, self.conversation2], @@ -394,9 +381,7 @@ def test_small_model_integration_test_batched(self): EXPECTED_DECODED_TEXT = ['The bar plot displays the values of four categories: step, horror, mood, and lumber', 'The video shows a group of people in green shirts and white shorts performing a jump rope routine'] # fmt: skip self.assertEqual( - processor.batch_decode( - generate_ids_without_inputs, skip_special_tokens=True - ), + processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True), EXPECTED_DECODED_TEXT, ) @@ -404,9 +389,7 @@ def test_small_model_integration_test_batched(self): @require_bitsandbytes def test_generation_no_images(self): # model_id = "facebook/Perception-LM-1B" - model = PerceptionLMForConditionalGeneration.from_pretrained( - TEST_MODEL_PATH, load_in_4bit=True - ) + model = PerceptionLMForConditionalGeneration.from_pretrained(TEST_MODEL_PATH, load_in_4bit=True) processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH) # Prepare inputs with no images @@ -414,4 +397,3 @@ def test_generation_no_images(self): # Make sure that `generate` works _ = model.generate(**inputs, max_new_tokens=20) - diff --git a/tests/models/perception_lm/test_processor_perception_lm.py b/tests/models/perception_lm/test_processor_perception_lm.py index 4d1667211596..0d9f6b4162f6 100644 --- a/tests/models/perception_lm/test_processor_perception_lm.py +++ b/tests/models/perception_lm/test_processor_perception_lm.py @@ -36,6 +36,7 @@ TEST_MODEL_PATH = "shumingh/plm_1b_hf" + @require_vision class PerceptionLMProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = PerceptionLMProcessor @@ -49,15 +50,10 @@ def setUpClass(cls): ) video_processor = PerceptionLMVideoProcessor() tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL_PATH) - tokenizer.add_special_tokens( - {"additional_special_tokens": ["<|image|>", "<|video|>"]} - 
) + tokenizer.add_special_tokens({"additional_special_tokens": ["<|image|>", "<|video|>"]}) processor_kwargs = cls.prepare_processor_dict() processor = PerceptionLMProcessor( - image_processor=image_processor, - video_processor=video_processor, - tokenizer=tokenizer, - **processor_kwargs + image_processor=image_processor, video_processor=video_processor, tokenizer=tokenizer, **processor_kwargs ) processor.save_pretrained(cls.tmpdirname) cls.image_token_id = processor.image_token_id @@ -90,9 +86,7 @@ def test_chat_template_is_saved(self): # they have to be saved as separate file and loaded back from that file # so we check if the same template is loaded processor_dict = self.prepare_processor_dict() - self.assertTrue( - processor_loaded.chat_template == processor_dict.get("chat_template", None) - ) + self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None)) def test_image_token_filling(self): processor = self.processor_class.from_pretrained(self.tmpdirname) diff --git a/tests/models/perception_lm/test_video_processing_perception_lm.py b/tests/models/perception_lm/test_video_processing_perception_lm.py index 74d3cdb76b3a..f411bc8bc85c 100644 --- a/tests/models/perception_lm/test_video_processing_perception_lm.py +++ b/tests/models/perception_lm/test_video_processing_perception_lm.py @@ -123,5 +123,5 @@ def test_video_processor_from_dict_with_kwargs(self): self.assertEqual(video_processor.crop_size, {"height": 18, "width": 18}) video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42, crop_size=84) - self.assertEqual(video_processor.size, {'height': 42, 'width': 42}) + self.assertEqual(video_processor.size, {"height": 42, "width": 42}) self.assertEqual(video_processor.crop_size, {"height": 84, "width": 84}) From ed1dd4b8ba2addd213d747130eb012e00a69c5b1 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 06:56:18 +0000 Subject: [PATCH 44/65] ran utils/sort_auto_mappings.py --- src/transformers/models/auto/configuration_auto.py | 4 ++-- src/transformers/models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/auto/processing_auto.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index e8eea7733680..012d7773bc8d 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -203,7 +203,6 @@ ("llama4", "Llama4Config"), ("llama4_text", "Llama4TextConfig"), ("llava", "LlavaConfig"), - ("perception_lm", "PerceptionLMConfig"), ("llava_next", "LlavaNextConfig"), ("llava_next_video", "LlavaNextVideoConfig"), ("llava_onevision", "LlavaOnevisionConfig"), @@ -269,6 +268,7 @@ ("pegasus_x", "PegasusXConfig"), ("perceiver", "PerceiverConfig"), ("perception_encoder", "PerceptionEncoderConfig"), + ("perception_lm", "PerceptionLMConfig"), ("persimmon", "PersimmonConfig"), ("phi", "PhiConfig"), ("phi3", "Phi3Config"), @@ -593,7 +593,6 @@ ("llama4", "Llama4"), ("llama4_text", "Llama4ForCausalLM"), ("llava", "LLaVa"), - ("perception_lm", "PerceptionLM"), ("llava_next", "LLaVA-NeXT"), ("llava_next_video", "LLaVa-NeXT-Video"), ("llava_onevision", "LLaVA-Onevision"), @@ -667,6 +666,7 @@ ("pegasus_x", "PEGASUS-X"), ("perceiver", "Perceiver"), ("perception_encoder", "PerceptionEncoder"), + ("perception_lm", "PerceptionLM"), ("persimmon", "Persimmon"), ("phi", "Phi"), ("phi3", "Phi3"), 
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index a8c7a6a198e6..e730eb179841 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -132,6 +132,7 @@ ("owlvit", ("OwlViTImageProcessor", "OwlViTImageProcessorFast")), ("paligemma", ("SiglipImageProcessor", "SiglipImageProcessorFast")), ("perceiver", ("PerceiverImageProcessor", "PerceiverImageProcessorFast")), + ("perception_lm", ("PerceptionLMImageProcessorFast",)), ("phi4_multimodal", ("Phi4MultimodalImageProcessorFast",)), ("pix2struct", ("Pix2StructImageProcessor",)), ("pixtral", ("PixtralImageProcessor", "PixtralImageProcessorFast")), @@ -176,7 +177,6 @@ ("xclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("yolos", ("YolosImageProcessor", "YolosImageProcessorFast")), ("zoedepth", ("ZoeDepthImageProcessor", "ZoeDepthImageProcessorFast")), - ("perception_lm", ("PerceptionLMImageProcessorFast",)), ] ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 9669185e3775..0390720947db 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -928,13 +928,13 @@ ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llama4", "Llama4ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), - ("perception_lm", "PerceptionLMForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), ("mistral3", "Mistral3ForConditionalGeneration"), ("mllama", "MllamaForConditionalGeneration"), ("paligemma", "PaliGemmaForConditionalGeneration"), + ("perception_lm", "PerceptionLMForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), ("pixtral", "LlavaForConditionalGeneration"), ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 151b0b66a3c7..3d8c54e0d6f8 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -87,7 +87,6 @@ ("layoutlmv3", "LayoutLMv3Processor"), ("llama4", "Llama4Processor"), ("llava", "LlavaProcessor"), - ("perception_lm", "PerceptionLMProcessor"), ("llava_next", "LlavaNextProcessor"), ("llava_next_video", "LlavaNextVideoProcessor"), ("llava_onevision", "LlavaOnevisionProcessor"), @@ -101,6 +100,7 @@ ("owlv2", "Owlv2Processor"), ("owlvit", "OwlViTProcessor"), ("paligemma", "PaliGemmaProcessor"), + ("perception_lm", "PerceptionLMProcessor"), ("phi4_multimodal", "Phi4MultimodalProcessor"), ("pix2struct", "Pix2StructProcessor"), ("pixtral", "PixtralProcessor"), From b77f53e6847959ca55d871b382cf8805f6a894b8 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 07:11:17 +0000 Subject: [PATCH 45/65] update docstring --- .../configuration_perception_lm.py | 45 +++++++------------ .../perception_lm/modeling_perception_lm.py | 3 +- .../perception_lm/processing_perception_lm.py | 22 ++++----- 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index a6e39713e65e..fac19af103e6 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ 
b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -31,18 +31,16 @@ class PerceptionEncoderConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - model_type (`str`, *optional*, defaults to `"perception_encoder"`): - The type of the model. use_cls_token (`bool`, *optional*, defaults to `True`): Whether to use a CLS token. architecture (`str`, *optional*, defaults to `"vit_pe_core_large_patch14_336"`): The architecture of the model. width (`int`, *optional*, defaults to `1024`): - The width of the model. - img_size (`Tuple[int, int]`, *optional*, defaults to `(448, 448)`): - The size of the input image. + The width (hidden size) of the model. + img_size (`List[int]`, *optional*, defaults to `[448, 448]`): + The size of the input image as [height, width]. depth (`int`, *optional*, defaults to `23`): - The depth of the model. + The number of layers in the model. num_classes (`int`, *optional*, defaults to `0`): The number of classes for classification. global_pool (`str`, *optional*, defaults to `""`): @@ -50,9 +48,9 @@ class PerceptionEncoderConfig(PretrainedConfig): use_post_transformer_norm (`bool`, *optional*, defaults to `False`): Whether to use post-transformer normalization. init_values (`float`, *optional*, defaults to `0.1`): - The initialization values. - ref_feat_shape (`Tuple[int, int]`, *optional*, defaults to `(32, 32)`): - The shape of the reference feature. + The initialization value for LayerScale. + ref_feat_shape (`List[int]`, *optional*, defaults to `[32, 32]`): + The shape of the reference feature as [height, width]. Example: @@ -111,33 +109,24 @@ class PerceptionLMConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`): + vision_config (`Union[PerceptionEncoderConfig, dict]`, *optional*, defaults to `PerceptionEncoderConfig()`): The config object or dictionary of the vision backbone. - text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `LlamaConfig()`): The config object or dictionary of the text backbone. - image_token_id (`int`, *optional*, defaults to 32000): + projector_pooling_ratio (`int`, *optional*, defaults to 1): + The pooling ratio used in the multimodal projector. + image_token_id (`int`, *optional*, defaults to 128002): The image token index to encode the image prompt. - projector_hidden_act (`str`, *optional*, defaults to `"gelu"`): - The activation function used by the multimodal projector. - vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): - The feature selection strategy used to select the vision feature from the vision backbone. - Can be one of `"default"` or `"full"`. - vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): - The index of the layer to select the vision feature. If multiple indices are provided, - the vision feature of the corresponding indices will be concatenated to form the - vision features. - image_seq_length (`int`, *optional*, defaults to 576): - Sequence length of one image embedding. - multimodal_projector_bias (`bool`, *optional*, defaults to `True`): - Whether to use bias in the multimodal projector. + video_token_id (`int`, *optional*, defaults to 128003): + The video token index to encode the video prompt. 
Example: ```python - >>> from transformers import PerceptionLMForConditionalGeneration, PerceptionLMConfig, CLIPVisionConfig, LlamaConfig + >>> from transformers import PerceptionLMForConditionalGeneration, PerceptionLMConfig, PerceptionEncoderConfig, LlamaConfig - >>> # Initializing a CLIP-vision config - >>> vision_config = CLIPVisionConfig() + >>> # Initializing a PerceptionEncoder config + >>> vision_config = PerceptionEncoderConfig() >>> # Initializing a Llama config >>> text_config = LlamaConfig() diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 06fa3bb23b94..b0bace060594 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -29,8 +29,9 @@ from transformers.generation.utils import GenerationMixin -# from ...generation import GenerationMixin from ...modeling_outputs import ModelOutput + +# from ...generation import GenerationMixin from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, can_return_tuple from ..auto import AutoModel diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index c52995bde05d..d06507af2c93 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -39,24 +39,24 @@ class PerceptionLMProcessorKwargs(ProcessingKwargs, total=False): class PerceptionLMProcessor(ProcessorMixin): r""" - Constructs a PerceptionLM processor which wraps a PerceptionLM image processor and a LLaMa tokenizer into a single processor. + Constructs a PerceptionLM processor which wraps a PerceptionLM image processor, a PerceptionLM video processor, and a tokenizer into a single processor. - [`PerceptionLMProcessor`] offers all the functionalities of [`PerceptionLMImageProcessor`] and [`PerceptionLMTokenizerFast`]. See the + [`PerceptionLMProcessor`] offers all the functionalities of [`PerceptionLMImageProcessorFast`], [`PerceptionLMVideoProcessor`], and the tokenizer (e.g. [`LlamaTokenizerFast`]). See the [`~PerceptionLMProcessor.__call__`] and [`~PerceptionLMProcessor.decode`] for more information. Args: - image_processor ([`PerceptionLMImageProcessor`], *optional*): - The image processor is a required input. video_processor ([`PerceptionLMVideoProcessor`], *optional*): - The video processor is a required input. - tokenizer ([`PerceptionLMTokenizerFast`], *optional*): - The tokenizer is a required input. + The video processor to process video inputs. + image_processor ([`PerceptionLMImageProcessorFast`], *optional*): + The image processor to process image inputs. + tokenizer ([`LlamaTokenizerFast`] or similar, *optional*): + The tokenizer to process text inputs. patch_size (`int`, *optional*): Patch size from the vision tower. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - pooling_ratio (`int`, *optional*): - Pooling ratio for vision tokens. If not 1, we do 2D adaptive pooling over projected vision tokens. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. + pooling_ratio (`int`, *optional*, defaults to 2): + Pooling ratio for vision tokens. 
If not 1, 2D adaptive pooling is applied over projected vision tokens.
     """

     attributes = ["video_processor", "image_processor", "tokenizer"]


From eebcc7a61e9ccf85b62dc9157dfa3cc2b68660c6 Mon Sep 17 00:00:00 2001
From: Shuming Hu
Date: Tue, 17 Jun 2025 07:12:30 +0000
Subject: [PATCH 46/65] more docstring updates

---
 .../configuration_perception_lm.py            |  8 +--
 .../perception_lm/modeling_perception_lm.py   | 62 +++++++++----------
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py
index fac19af103e6..d2643174cb13 100644
--- a/src/transformers/models/perception_lm/configuration_perception_lm.py
+++ b/src/transformers/models/perception_lm/configuration_perception_lm.py
@@ -35,19 +35,19 @@ class PerceptionEncoderConfig(PretrainedConfig):
             Whether to use a CLS token.
         architecture (`str`, *optional*, defaults to `"vit_pe_core_large_patch14_336"`):
             The architecture of the model.
-        width (`int`, *optional*, defaults to `1024`):
+        width (`int`, *optional*, defaults to 1024):
             The width (hidden size) of the model.
         img_size (`List[int]`, *optional*, defaults to `[448, 448]`):
             The size of the input image as [height, width].
-        depth (`int`, *optional*, defaults to `23`):
+        depth (`int`, *optional*, defaults to 23):
             The number of layers in the model.
-        num_classes (`int`, *optional*, defaults to `0`):
+        num_classes (`int`, *optional*, defaults to 0):
             The number of classes for classification.
         global_pool (`str`, *optional*, defaults to `""`):
             The global pooling strategy.
         use_post_transformer_norm (`bool`, *optional*, defaults to `False`):
             Whether to use post-transformer normalization.
-        init_values (`float`, *optional*, defaults to `0.1`):
+        init_values (`float`, *optional*, defaults to 0.1):
             The initialization value for LayerScale.
         ref_feat_shape (`List[int]`, *optional*, defaults to `[32, 32]`):
             The shape of the reference feature as [height, width].
diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py
index b0bace060594..c517d82b7f24 100644
--- a/src/transformers/models/perception_lm/modeling_perception_lm.py
+++ b/src/transformers/models/perception_lm/modeling_perception_lm.py
@@ -238,37 +238,37 @@ def forward(
         **lm_kwargs,
     ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]:
         r"""
-        Args:
-            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Indices of input sequence tokens in the vocabulary.
-            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*):
-                Pixel values for input images.
-            pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*):
-                Pixel values for input videos.
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices.
-            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Indices of positions of each input sequence token in the position embeddings.
-            past_key_values (`List[torch.FloatTensor]`, *optional*):
-                List of precomputed key and value hidden states for each layer, used for fast autoregressive generation.
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. - use_cache (`bool`, *optional*): - Whether or not to use past key values to speed up decoding. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor`, *optional*): - Position indices for cached key/value states, used for efficient generation. - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the - sequence length dimension. - lm_kwargs: - Additional keyword arguments passed to the language model. + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): + Pixel values for input images. + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): + Pixel values for input videos. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence token in the position embeddings. + past_key_values (`List[torch.FloatTensor]`, *optional*): + List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. + use_cache (`bool`, *optional*): + Whether or not to use past key values to speed up decoding. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor`, *optional*): + Position indices for cached key/value states, used for efficient generation. + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the + sequence length dimension. + lm_kwargs: + Additional keyword arguments passed to the language model. 
+ Example: (TODO: fix example) ```python From 2c73fc4dbd425c109dc35d3279c193d4253c2ca0 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Tue, 17 Jun 2025 07:23:14 +0000 Subject: [PATCH 47/65] add vision_input_type default fallback for image processing --- .../image_processing_perception_lm_fast.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 996105f26d92..be196162797c 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -270,12 +270,15 @@ def _preprocess( resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: - thumbnails, _ = self.resize(stacked_images, self.tile_size, max_num_tiles=1) - images_for_tiling, (tiles_w, tiles_h) = self.resize( - stacked_images, self.tile_size, max_num_tiles=self.max_num_tiles - ) - image_tiles = self._split(images_for_tiling, tiles_w, tiles_h) - stacked_images = torch.cat([thumbnails.unsqueeze(1), image_tiles], dim=1) + if self.vision_input_type == "thumb+tile": + thumbnails, _ = self.resize(stacked_images, self.tile_size, max_num_tiles=1) + images_for_tiling, (tiles_w, tiles_h) = self.resize( + stacked_images, self.tile_size, max_num_tiles=self.max_num_tiles + ) + image_tiles = self._split(images_for_tiling, tiles_w, tiles_h) + stacked_images = torch.cat([thumbnails.unsqueeze(1), image_tiles], dim=1) + else: # vanilla single tile for low memory devices + stacked_images, _ = self.resize(stacked_images, self.tile_size, max_num_tiles=1) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) From 6ceb83a7db0f3382c4780b47eab9b610ee4498ec Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 18 Jun 2025 06:01:47 +0000 Subject: [PATCH 48/65] more verbose variable naming --- .../image_processing_perception_lm_fast.py | 34 ++++++++----------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index be196162797c..94cc460c8517 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -199,29 +199,25 @@ def _find_closest_aspect_ratio(self, img_width: int, img_height: int, tile_size: Given an image width, height and target number of chunks this function will find the closest supported aspect ratio. 
""" - tgt_ar = img_width / img_height + target_aspect_ratio = img_width / img_height asp_dict = self._find_supported_aspect_ratios() - cl_p = None - if tgt_ar >= 1: - cl_p = min( - [k for k in asp_dict.keys() if k <= tgt_ar], - key=lambda x: abs(x - tgt_ar), + closest_aspect_ratio = None + if target_aspect_ratio >= 1: + closest_aspect_ratio = min( + [k for k in asp_dict.keys() if k <= target_aspect_ratio], + key=lambda x: abs(x - target_aspect_ratio), ) - v = asp_dict[cl_p] - # select width - widths = [(idx, tile_size * vv[0]) for idx, vv in enumerate(v)] - tgt_idx = max(widths, key=lambda x: x[1])[0] + tiles_given_aspect_ratio = asp_dict[closest_aspect_ratio] + # select largest width + return max(tiles_given_aspect_ratio, key=lambda x: x[0]) else: - cl_p = min( - [k for k in asp_dict.keys() if k > tgt_ar], - key=lambda x: abs(1 / x - 1 / tgt_ar), + closest_aspect_ratio = min( + [k for k in asp_dict.keys() if k > target_aspect_ratio], + key=lambda x: abs(1 / x - 1 / target_aspect_ratio), ) - v = asp_dict[cl_p] - # select height - heights = [(idx, tile_size * vv[1]) for idx, vv in enumerate(v)] - tgt_idx = max(heights, key=lambda x: x[1])[0] - out = v[tgt_idx] - return out + tiles_given_aspect_ratio = asp_dict[closest_aspect_ratio] + # select largest height + return max(tiles_given_aspect_ratio, key=lambda x: x[1]) def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor: # Split image into number of required tiles (width x height) From 87c6ca4e39f784b4eb4b5eb461f9c00f184a7890 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 18 Jun 2025 06:36:19 +0000 Subject: [PATCH 49/65] test update --- .../image_processing_perception_lm_fast.py | 24 +++++---- .../test_modeling_perception_lm.py | 51 +++++++++++++++++-- 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 94cc460c8517..2c577ea8c82f 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -237,17 +237,17 @@ def resize( resample: PILImageResampling = PILImageResampling.BICUBIC, input_data_format: Optional[Union[str, ChannelDimension]] = None, ): - h, w = get_image_size(image, channel_dim=input_data_format) + height, width = get_image_size(image, channel_dim=input_data_format) if max_num_tiles > 1: - ar = self._fit_image_to_canvas(img_width=w, img_height=h, tile_size=tile_size) - if ar is None: + aspect_ratio = self._fit_image_to_canvas(img_width=width, img_height=height, tile_size=tile_size) + if aspect_ratio is None: # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image - ar = self._find_closest_aspect_ratio(img_width=w, img_height=h, tile_size=tile_size) + aspect_ratio = self._find_closest_aspect_ratio(img_width=width, img_height=height, tile_size=tile_size) else: - ar = (1, 1) - new_w, new_h = ar[0] * tile_size, ar[1] * tile_size - image = F.resize(image, (new_h, new_w), interpolation=resample) - return image, ar + aspect_ratio = (1, 1) + new_width, new_height = aspect_ratio[0] * tile_size, aspect_ratio[1] * tile_size + image = F.resize(image, (new_height, new_width), interpolation=resample) + return image, aspect_ratio def _preprocess( self, @@ -258,6 +258,8 @@ def _preprocess( do_normalize: Optional[bool], image_mean: Optional[Union[float, list[float]]], image_std: Optional[Union[float, 
list[float]]], + tile_size: int, + max_num_tiles: int, return_tensors: Optional[Union[str, TensorType]], **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs], ) -> BatchFeature: @@ -267,14 +269,14 @@ def _preprocess( for shape, stacked_images in grouped_images.items(): if do_resize: if self.vision_input_type == "thumb+tile": - thumbnails, _ = self.resize(stacked_images, self.tile_size, max_num_tiles=1) + thumbnails, _ = self.resize(stacked_images, tile_size, max_num_tiles=1) images_for_tiling, (tiles_w, tiles_h) = self.resize( - stacked_images, self.tile_size, max_num_tiles=self.max_num_tiles + stacked_images, tile_size, max_num_tiles=max_num_tiles ) image_tiles = self._split(images_for_tiling, tiles_w, tiles_h) stacked_images = torch.cat([thumbnails.unsqueeze(1), image_tiles], dim=1) else: # vanilla single tile for low memory devices - stacked_images, _ = self.resize(stacked_images, self.tile_size, max_num_tiles=1) + stacked_images, _ = self.resize(stacked_images, tile_size, max_num_tiles=1) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index 256380dbf510..15e4f0d65020 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -106,6 +106,7 @@ def __init__( self.num_channels = 3 self.image_size = self.vision_config["img_size"][0] self.num_image_tokens = (self.vision_config["img_size"][0] // 14) ** 2 + self.num_video_tokens = (self.vision_config["img_size"][0] // 14) ** 2 self.seq_length = seq_length + self.num_image_tokens self.encoder_seq_length = self.seq_length @@ -148,13 +149,11 @@ def prepare_config_and_inputs_for_common(self): input_ids[input_ids == config.image_token_id] = self.pad_token_id input_ids[input_ids == config.video_token_id] = self.pad_token_id input_ids[:, : self.num_image_tokens] = config.image_token_id - # input_ids[ - # :, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens - # ] = config.video_token_id + input_ids[:, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens] = config.video_token_id inputs_dict = { "pixel_values": pixel_values, - # "pixel_values_videos": pixel_values_videos, + "pixel_values_videos": pixel_values_videos, "input_ids": input_ids, "attention_mask": attention_mask, } @@ -202,6 +201,7 @@ def test_inputs_embeds(self): input_ids = inputs["input_ids"] del inputs["input_ids"] del inputs["pixel_values"] + del inputs["pixel_values_videos"] wte = model.get_input_embeddings() inputs["inputs_embeds"] = wte(input_ids) @@ -223,6 +223,7 @@ def test_inputs_embeds_matches_input_ids(self): input_ids = inputs["input_ids"] del inputs["input_ids"] del inputs["pixel_values"] + del inputs["pixel_values_videos"] inputs_embeds = model.get_input_embeddings()(input_ids) @@ -284,6 +285,48 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): pass + @unittest.skip( + "ViT PE cannot be tested with meta device" + ) + def test_can_be_initialized_on_meta(self): + pass + + @unittest.skip( + "ViT PE cannot be tested with meta device" + ) + def test_can_load_with_meta_device_context_manager(self): + pass + + @unittest.skip( + "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" + ) + def 
test_generate_from_inputs_embeds_0_greedy(self): + pass + + @unittest.skip( + "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" + ) + def test_generate_from_inputs_embeds_1_beam_search(self): + pass + + @unittest.skip( + "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" + ) + def test_generate_from_inputs_embeds_with_static_cache(self): + pass + + @unittest.skip( + "We don't support initializing all missing weights, only finetuning is supported." + ) + def test_can_init_all_missing_weights(self): + pass + + @unittest.skip( + "We don't support initializing all missing weights, only finetuning is supported." + ) + def test_initialization(self): + pass + TEST_MODEL_PATH = "shumingh/plm_1b_hf" From d7b47d68836f70c5b482ec630aa4c8eb34fc28f7 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 25 Jun 2025 20:06:28 +0000 Subject: [PATCH 50/65] Remove PE and PEConfig use AutoModel(TimmWrapper) instead --- .../models/auto/configuration_auto.py | 2 +- .../configuration_perception_lm.py | 96 ++--------- .../convert_perception_lm_weights_to_hf.py | 40 +++-- .../image_processing_perception_lm_fast.py | 4 +- .../perception_lm/modeling_perception_lm.py | 163 ++++++++---------- .../perception_lm/modular_perception_lm.py | 31 +--- 6 files changed, 109 insertions(+), 227 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 012d7773bc8d..7c6c7a8ec779 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -267,7 +267,7 @@ ("pegasus", "PegasusConfig"), ("pegasus_x", "PegasusXConfig"), ("perceiver", "PerceiverConfig"), - ("perception_encoder", "PerceptionEncoderConfig"), + ("perception_encoder", "TimmWrapperConfig"), ("perception_lm", "PerceptionLMConfig"), ("persimmon", "PersimmonConfig"), ("phi", "PhiConfig"), diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index d2643174cb13..13fabe2b2de3 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -13,90 +13,14 @@ # limitations under the License. """PerceptionLM model configuration""" -from transformers.configuration_utils import PretrainedConfig - +from ...configuration_utils import PretrainedConfig from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig +from ..timm_wrapper.configuration_timm_wrapper import TimmWrapperConfig logger = logging.get_logger(__name__) - -class PerceptionEncoderConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`PerceptionEncoder`]. It is used to instantiate a - PerceptionEncoder model according to the specified arguments, defining the model architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - use_cls_token (`bool`, *optional*, defaults to `True`): - Whether to use a CLS token. - architecture (`str`, *optional*, defaults to `"vit_pe_core_large_patch14_336"`): - The architecture of the model. - width (`int`, *optional*, defaults to 1024): - The width (hidden size) of the model. - img_size (`List[int]`, *optional*, defaults to `[448, 448]`): - The size of the input image as [height, width]. 
- depth (`int`, *optional*, defaults to 23): - The number of layers in the model. - num_classes (`int`, *optional*, defaults to 0): - The number of classes for classification. - global_pool (`str`, *optional*, defaults to `""`): - The global pooling strategy. - use_post_transformer_norm (`bool`, *optional*, defaults to `False`): - Whether to use post-transformer normalization. - init_values (`float`, *optional*, defaults to 0.1): - The initialization value for LayerScale. - ref_feat_shape (`List[int]`, *optional*, defaults to `[32, 32]`): - The shape of the reference feature as [height, width]. - - Example: - - ```python - >>> from transformers import PerceptionEncoder, PerceptionEncoderConfig - - >>> # Initializing a PerceptionEncoder configuration - >>> configuration = PerceptionEncoderConfig() - - >>> # Initializing a model from the configuration - >>> model = PerceptionEncoder(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ``` - """ - - model_type = "perception_encoder" - - def __init__( - self, - use_cls_token=True, - architecture="vit_pe_core_large_patch14_336", - width=1024, - img_size=[448, 448], - depth=23, - num_classes=0, - global_pool="", - use_post_transformer_norm=False, - init_values=0.1, - ref_feat_shape=[32, 32], - **kwargs, - ): - super().__init__(**kwargs) - self.use_cls_token = use_cls_token - self.architecture = architecture - self.width = width - self.img_size = img_size - self.depth = depth - self.num_classes = num_classes - self.global_pool = global_pool - self.use_post_transformer_norm = use_post_transformer_norm - self.init_values = init_values - self.ref_feat_shape = ref_feat_shape - - class PerceptionLMConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`PerceptionLMForConditionalGeneration`]. It is used to instantiate an @@ -109,7 +33,7 @@ class PerceptionLMConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - vision_config (`Union[PerceptionEncoderConfig, dict]`, *optional*, defaults to `PerceptionEncoderConfig()`): + vision_config (`Union[TimmWrapperConfig, dict]`, *optional*, defaults to `TimmWrapperConfig()`): The config object or dictionary of the vision backbone. text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `LlamaConfig()`): The config object or dictionary of the text backbone. 
@@ -123,10 +47,10 @@ class PerceptionLMConfig(PretrainedConfig): Example: ```python - >>> from transformers import PerceptionLMForConditionalGeneration, PerceptionLMConfig, PerceptionEncoderConfig, LlamaConfig + >>> from transformers import PerceptionLMForConditionalGeneration, PerceptionLMConfig, TimmWrapperConfig, LlamaConfig >>> # Initializing a PerceptionEncoder config - >>> vision_config = PerceptionEncoderConfig() + >>> vision_config = TimmWrapperConfig() >>> # Initializing a Llama config >>> text_config = LlamaConfig() @@ -142,12 +66,13 @@ class PerceptionLMConfig(PretrainedConfig): ```""" model_type = "perception_lm" - sub_configs = {"text_config": AutoConfig, "vision_config": PerceptionEncoderConfig} + sub_configs = {"text_config": AutoConfig, "vision_config": TimmWrapperConfig} def __init__( self, vision_config=None, text_config=None, + vision_use_cls_token=True, projector_pooling_ratio=1, image_token_id=128002, video_token_id=128003, @@ -156,12 +81,13 @@ def __init__( self.image_token_id = image_token_id self.video_token_id = video_token_id if isinstance(vision_config, dict): - vision_config = PerceptionEncoderConfig(**vision_config) - elif isinstance(vision_config, PerceptionEncoderConfig): + vision_config = TimmWrapperConfig(**vision_config) + elif isinstance(vision_config, TimmWrapperConfig): vision_config = vision_config elif vision_config is None: - vision_config = PerceptionEncoderConfig() + vision_config = TimmWrapperConfig() self.vision_config = vision_config + self.vision_use_cls_token = vision_use_cls_token if isinstance(text_config, dict): text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama" diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index f7bf26dd1dc6..91726b20c0fd 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -30,15 +30,14 @@ PreTrainedTokenizerFast, ) from transformers.convert_slow_tokenizer import TikTokenConverter +from transformers.models.auto.modeling_auto import AutoModel from transformers.models.perception_lm.configuration_perception_lm import ( - PerceptionEncoderConfig, PerceptionLMConfig, ) from transformers.models.perception_lm.image_processing_perception_lm_fast import ( PerceptionLMImageProcessorFast, ) from transformers.models.perception_lm.modeling_perception_lm import ( - PerceptionEncoder, PerceptionLMForConditionalGeneration, ) from transformers.models.perception_lm.processing_perception_lm import ( @@ -47,6 +46,7 @@ from transformers.models.perception_lm.video_processing_perception_lm import ( PerceptionLMVideoProcessor, ) +from transformers.models.timm_wrapper.configuration_timm_wrapper import TimmWrapperConfig try: @@ -324,24 +324,25 @@ def permute(w, n_heads, dim1=dim, dim2=dim): f"Unsupported PE config: {vision_params['layers']} layers and {vision_params['width']} width" ) - vision_config = PerceptionEncoderConfig( - use_cls_token=vision_params["use_cls_token"], - width=vision_params["width"], - architecture=architecture, - img_size=(vision_params["image_size"], vision_params["image_size"]), - depth=vision_params["layers"], - num_classes=0, - global_pool="", - use_post_transformer_norm=vision_params["use_ln_post"], - init_values=vision_params["ls_init_value"], - ref_feat_shape=( - vision_params["image_size"] // vision_params["patch_size"], - 
vision_params["image_size"] // vision_params["patch_size"], - ), + vision_config = TimmWrapperConfig.from_pretrained( + f"timm/{architecture}.fb", + model_args={ + "embed_dim": vision_params["width"], + "depth": vision_params["layers"], + "img_size": (vision_params["image_size"], vision_params["image_size"]), + "global_pool": "", + "use_post_transformer_norm": vision_params["use_ln_post"], + "init_values": vision_params["ls_init_value"], + "ref_feat_shape": ( + vision_params["image_size"] // vision_params["patch_size"], + vision_params["image_size"] // vision_params["patch_size"], + ), + }, ) - perception_encoder = PerceptionEncoder(vision_config) - state_dict = checkpoint_filter_fn(state_dict, perception_encoder.eva_pe) - state_dict = {"model.vision_tower.eva_pe." + k: v for k, v in state_dict.items()} + + perception_encoder = AutoModel.from_config(vision_config) + state_dict = checkpoint_filter_fn(state_dict, perception_encoder) + state_dict = {"model.vision_tower.timm_model." + k: v for k, v in state_dict.items()} for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() @@ -388,6 +389,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): config = PerceptionLMConfig( text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), + vision_tower_config=vision_params["use_cls_token"], projector_pooling_ratio=projector_pooling_ratio, image_token_id=image_token_id, ) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 2c577ea8c82f..7b3a4e412372 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -264,7 +264,7 @@ def _preprocess( **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs], ) -> BatchFeature: # Group images by size for batched transformation - grouped_images, grouped_images_index = group_images_by_shape(images) + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=False) resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: @@ -281,7 +281,7 @@ def _preprocess( resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) - grouped_images, grouped_images_index = group_images_by_shape(resized_images) + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=False) processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): # Fused rescale and normalize diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index c517d82b7f24..93002e5f15b7 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -20,9 +20,8 @@ import math from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import Optional, Union -import timm import torch import torch.nn.functional as F from torch import nn @@ -35,31 +34,7 @@ from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, can_return_tuple from ..auto import AutoModel -from .configuration_perception_lm import PerceptionEncoderConfig, PerceptionLMConfig - - -class PerceptionEncoder(PreTrainedModel): - def __init__(self, config: 
PerceptionEncoderConfig): - super().__init__(config) - self.use_cls_token = config.use_cls_token - self.eva_pe = timm.create_model( - config.architecture, - img_size=config.img_size, - depth=config.depth, - num_classes=config.num_classes, - global_pool=config.global_pool, - use_post_transformer_norm=config.use_post_transformer_norm, - init_values=config.init_values, - ref_feat_shape=config.ref_feat_shape, - embed_dim=config.width, - ) - - def forward(self, x): - x = self.eva_pe(x) - if self.use_cls_token: - return x[:, 1:, :] - else: - return x +from .configuration_perception_lm import PerceptionLMConfig class AdaptiveAvgPooling(nn.Module): @@ -84,7 +59,7 @@ def forward(self, hidden_states): class PerceptionLMMultiModalProjector(nn.Module): def __init__(self, config: PerceptionLMConfig): super().__init__() - input_size = config.vision_config.width + input_size = config.vision_config.model_args["embed_dim"] output_size = config.text_config.hidden_size self.projector = nn.ModuleList( [ @@ -144,42 +119,33 @@ def _init_weights(self, module): @dataclass -class PerceptionLMCausalLMOutputWithPast(ModelOutput): - """ +@auto_docstring( + custom_intro=""" Base class for PerceptionLM causal language model (or autoregressive) outputs. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss (for next-token prediction). - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) - - Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - image_hidden_states (`torch.FloatTensor`, *optional*): - A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. - image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ +) +class PerceptionLMCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). 
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. """ loss: Optional[torch.FloatTensor] = None logits: Optional[torch.FloatTensor] = None - past_key_values: Optional[List[torch.FloatTensor]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None + past_key_values: Optional[list[torch.FloatTensor]] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None image_hidden_states: Optional[torch.FloatTensor] = None @@ -200,6 +166,12 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.language_model.set_input_embeddings(value) + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + def get_image_features( self, pixel_values: torch.FloatTensor, @@ -215,6 +187,9 @@ def get_image_features( image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`). """ image_outputs = self.vision_tower(pixel_values.flatten(0, 1)) + image_outputs = image_outputs.last_hidden_state + if self.config.vision_use_cls_token: + image_outputs = image_outputs[:, 1:, :] image_features = self.multi_modal_projector(image_outputs) return image_features @@ -227,7 +202,7 @@ def forward( pixel_values_videos: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[list[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -236,39 +211,39 @@ def forward( cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, - ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: + ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: r""" - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of input sequence tokens in the vocabulary. - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): - Pixel values for input images. - pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): - Pixel values for input videos. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. 
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence token in the position embeddings. - past_key_values (`List[torch.FloatTensor]`, *optional*): - List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. - use_cache (`bool`, *optional*): - Whether or not to use past key values to speed up decoding. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor`, *optional*): - Position indices for cached key/value states, used for efficient generation. - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the - sequence length dimension. - lm_kwargs: - Additional keyword arguments passed to the language model. - + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): + Pixel values for input images. + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): + Pixel values for input videos. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence token in the position embeddings. + past_key_values (`List[torch.FloatTensor]`, *optional*): + List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. + use_cache (`bool`, *optional*): + Whether or not to use past key values to speed up decoding. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor`, *optional*): + Position indices for cached key/value states, used for efficient generation. + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the + sequence length dimension. + lm_kwargs: + Additional keyword arguments passed to the language model. 
Example: (TODO: fix example) ```python @@ -414,7 +389,7 @@ def forward( pixel_values_videos: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[list[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, @@ -424,7 +399,7 @@ def forward( cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, - ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: + ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index e98a773acb61..95ebe33654fb 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -38,7 +38,7 @@ LlavaModel, LlavaPreTrainedModel, ) -from .configuration_perception_lm import PerceptionEncoderConfig, PerceptionLMConfig +from .configuration_perception_lm import PerceptionLMConfig logger = logging.get_logger(__name__) @@ -49,30 +49,6 @@ _CHECKPOINT_FOR_DOC = "facebook/Perception-LM-1B" -class PerceptionEncoder(PreTrainedModel): - def __init__(self, config: PerceptionEncoderConfig): - super().__init__(config) - self.use_cls_token = config.use_cls_token - self.eva_pe = timm.create_model( - config.architecture, - img_size=config.img_size, - depth=config.depth, - num_classes=config.num_classes, - global_pool=config.global_pool, - use_post_transformer_norm=config.use_post_transformer_norm, - init_values=config.init_values, - ref_feat_shape=config.ref_feat_shape, - embed_dim=config.width, - ) - - def forward(self, x): - x = self.eva_pe(x) - if self.use_cls_token: - return x[:, 1:, :] - else: - return x - - class AdaptiveAvgPooling(nn.Module): def __init__(self, pooling_ratio=2): super(AdaptiveAvgPooling, self).__init__() @@ -95,7 +71,7 @@ def forward(self, hidden_states): class PerceptionLMMultiModalProjector(nn.Module): def __init__(self, config: PerceptionLMConfig): super().__init__() - input_size = config.vision_config.width + input_size = config.vision_config.model_args["embed_dim"] output_size = config.text_config.hidden_size self.projector = nn.ModuleList( [ @@ -153,6 +129,9 @@ def get_image_features( image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`). """ image_outputs = self.vision_tower(pixel_values.flatten(0, 1)) + image_outputs = image_outputs.last_hidden_state + if self.config.vision_use_cls_token: + image_outputs = image_outputs[:, 1:, :] image_features = self.multi_modal_projector(image_outputs) return image_features From cbc10576aa9a3be34a64b3832c588d77f98285c6 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 25 Jun 2025 20:15:51 +0000 Subject: [PATCH 51/65] Minor cleanup. 
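In the modeling code above, vision tokens pass through `PerceptionLMMultiModalProjector`, whose `AdaptiveAvgPooling` collapses the square grid of ViT tokens before they are projected into the language model's embedding space. A minimal, self-contained sketch of that pooling step is shown below, assuming a square token grid; the helper name `pool_vision_tokens` and the explicit `adaptive_avg_pool2d`-then-flatten sequence are illustrative assumptions rather than the module's verbatim code.

```python
import math

import torch
import torch.nn.functional as F


def pool_vision_tokens(hidden_states: torch.Tensor, pooling_ratio: int = 2) -> torch.Tensor:
    """Average-pool a (batch, num_tokens, channels) sequence laid out as a square grid."""
    b, num_tokens, c = hidden_states.shape
    h = int(math.sqrt(num_tokens))
    if h * h != num_tokens:
        raise ValueError(f"num_tokens {num_tokens} is expected to be a square number")
    # (B, N, C) -> (B, C, h, h): lay the token sequence back out as a 2D feature map
    grid = hidden_states.permute(0, 2, 1).reshape(b, c, h, h)
    # Shrink each spatial side by `pooling_ratio` with adaptive average pooling
    pooled = F.adaptive_avg_pool2d(grid, (h // pooling_ratio, h // pooling_ratio))
    # (B, C, h/r, h/r) -> (B, (h/r)^2, C): back to a token sequence for the projector
    return pooled.flatten(2).transpose(1, 2)


tokens = torch.randn(1, 1024, 64)           # a 32x32 grid of 64-dim vision tokens
print(pool_vision_tokens(tokens, 2).shape)  # torch.Size([1, 256, 64])
```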
--- docs/source/en/model_doc/perception_lm.md | 3 --- src/transformers/models/auto/image_processing_auto.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/docs/source/en/model_doc/perception_lm.md b/docs/source/en/model_doc/perception_lm.md index 14de19791f3d..172bd68199a0 100644 --- a/docs/source/en/model_doc/perception_lm.md +++ b/docs/source/en/model_doc/perception_lm.md @@ -37,9 +37,6 @@ video captions. Additionally, we introduce PLM–VideoBench, a suite for evaluat understanding tasks focusing on the ability to reason about “what”, “where”, “when”, and “how” of a video. We make our work fully reproducible by providing data, training recipes, code & models.* -Tips: - - This model was contributed by [shumingh](https://huggingface.co/shumingh). The original code can be found [here](https://github.com/facebookresearch/perception_models). diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index e730eb179841..4765da007888 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -598,8 +598,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): raise ValueError( "This image processor cannot be instantiated. Please make sure you have `Pillow` installed." ) - print("config type", type(config)) - print("config", config) raise ValueError( f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a " f"`image_processor_type` key in its {IMAGE_PROCESSOR_NAME} of {CONFIG_NAME}, or one of the following " From d62b35d898240bb2a2f05e8eb95079b06a655962 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 25 Jun 2025 20:21:42 +0000 Subject: [PATCH 52/65] Minor Fix: remove any ref to PE. Ruff format and check. --- .../configuration_perception_lm.py | 3 +- .../convert_perception_lm_weights_to_hf.py | 3 +- .../image_processing_perception_lm_fast.py | 10 +++---- .../perception_lm/modeling_perception_lm.py | 2 +- .../perception_lm/modular_perception_lm.py | 13 ++++----- .../perception_lm/processing_perception_lm.py | 4 +-- .../test_modeling_perception_lm.py | 28 +++++-------------- 7 files changed, 23 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 13fabe2b2de3..f74337d3a693 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -21,6 +21,7 @@ logger = logging.get_logger(__name__) + class PerceptionLMConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`PerceptionLMForConditionalGeneration`]. 
It is used to instantiate an @@ -100,4 +101,4 @@ def __init__( super().__init__(**kwargs) -__all__ = ["PerceptionLMConfig", "PerceptionEncoderConfig"] +__all__ = ["PerceptionLMConfig"] diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 91726b20c0fd..c241f6a3beb9 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -17,7 +17,6 @@ import os import tempfile import warnings -from typing import List import torch from timm.models.eva import checkpoint_filter_fn @@ -581,7 +580,7 @@ def main(): parser.add_argument( "--special_tokens", default=None, - type=List[str], + type=list[str], help="The list of special tokens that should be added to the model.", ) args = parser.parse_args() diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 7b3a4e412372..9e669df08fe1 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -14,7 +14,7 @@ import math from functools import reduce -from typing import List, Optional, Tuple, Union +from typing import Optional, Union import numpy as np @@ -115,7 +115,7 @@ def _find_supported_aspect_ratios(self): def _get_image_height_width( self, image_width: int, image_height: int, target_width: int, target_height: int - ) -> Tuple[int, int]: + ) -> tuple[int, int]: """ Given image width, height and target width, height for the canvas, return the dimensions of how the image would be resized with aspect ratio preservation. @@ -194,7 +194,7 @@ def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int): optimal_image_width_height = image_width_height return optimal_canvas - def _find_closest_aspect_ratio(self, img_width: int, img_height: int, tile_size: int) -> Tuple: + def _find_closest_aspect_ratio(self, img_width: int, img_height: int, tile_size: int) -> tuple: """ Given an image width, height and target number of chunks this function will find the closest supported aspect ratio. 
@@ -251,7 +251,7 @@ def resize( def _preprocess( self, - images: List["torch.Tensor"], + images: list["torch.Tensor"], do_resize: bool, do_rescale: Optional[bool], rescale_factor: Optional[Union[int, float]], @@ -275,7 +275,7 @@ def _preprocess( ) image_tiles = self._split(images_for_tiling, tiles_w, tiles_h) stacked_images = torch.cat([thumbnails.unsqueeze(1), image_tiles], dim=1) - else: # vanilla single tile for low memory devices + else: # vanilla single tile for low memory devices stacked_images, _ = self.resize(stacked_images, tile_size, max_num_tiles=1) resized_images_grouped[shape] = stacked_images diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 93002e5f15b7..0252fcc2f0c3 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -470,4 +470,4 @@ def forward( ) -__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel", "PerceptionEncoder"] +__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel"] diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 95ebe33654fb..301b932e4b02 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -14,9 +14,8 @@ """PyTorch PerceptionLM model.""" import math -from typing import List, Optional, Tuple, Union +from typing import Optional, Union -import timm import torch import torch.nn.functional as F import torch.utils.checkpoint @@ -25,7 +24,6 @@ from transformers.generation.utils import GenerationMixin # from ...generation import GenerationMixin -from ...modeling_utils import PreTrainedModel from ...utils import ( auto_docstring, logging, @@ -150,7 +148,7 @@ def forward( pixel_values_videos: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[list[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -159,7 +157,7 @@ def forward( cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, - ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: + ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -329,7 +327,7 @@ def forward( pixel_values_videos: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[list[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, @@ -339,7 +337,7 @@ def forward( cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, - ) -> Union[Tuple, PerceptionLMCausalLMOutputWithPast]: + ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., @@ -413,5 +411,4 @@ def forward( __all__ = [ "PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel", - "PerceptionEncoder", ] diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index d06507af2c93..351e4eb4cdea 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -16,7 +16,7 @@ Processor class for PerceptionLM. """ -from typing import Iterable, List, Union +from typing import Iterable, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, get_image_size, to_numpy_array @@ -85,7 +85,7 @@ def __init__( def __call__( self, images: ImageInput = None, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, audio=None, videos: VideoInput = None, **kwargs: Unpack[PerceptionLMProcessorKwargs], diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index 15e4f0d65020..fc3468307451 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -285,45 +285,31 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): pass - @unittest.skip( - "ViT PE cannot be tested with meta device" - ) + @unittest.skip("ViT PE cannot be tested with meta device") def test_can_be_initialized_on_meta(self): pass - @unittest.skip( - "ViT PE cannot be tested with meta device" - ) + @unittest.skip("ViT PE cannot be tested with meta device") def test_can_load_with_meta_device_context_manager(self): pass - @unittest.skip( - "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" - ) + @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM") def test_generate_from_inputs_embeds_0_greedy(self): pass - @unittest.skip( - "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" - ) + @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM") def test_generate_from_inputs_embeds_1_beam_search(self): pass - @unittest.skip( - "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" - ) + @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM") def test_generate_from_inputs_embeds_with_static_cache(self): pass - @unittest.skip( - "We don't support initializing all missing weights, only finetuning is supported." - ) + @unittest.skip("We don't support initializing all missing weights, only finetuning is supported.") def test_can_init_all_missing_weights(self): pass - @unittest.skip( - "We don't support initializing all missing weights, only finetuning is supported." 
- ) + @unittest.skip("We don't support initializing all missing weights, only finetuning is supported.") def test_initialization(self): pass From f6d095ae8f7157520da2bc54b3e602fc7fbc20d5 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 25 Jun 2025 20:30:09 +0000 Subject: [PATCH 53/65] fix docstring --- .../configuration_perception_lm.py | 7 +-- .../perception_lm/modeling_perception_lm.py | 60 +++++++++---------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index f74337d3a693..a6df856ae149 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -25,10 +25,7 @@ class PerceptionLMConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`PerceptionLMForConditionalGeneration`]. It is used to instantiate an - PerceptionLM model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the PerceptionLM-9B. - - e.g. [perception_lm-hf/perception_lm-9b](https://huggingface.co/perception_lm-hf/perception_lm-9b) + PerceptionLM model according to the specified arguments, defining the model architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -38,6 +35,8 @@ class PerceptionLMConfig(PretrainedConfig): The config object or dictionary of the vision backbone. text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `LlamaConfig()`): The config object or dictionary of the text backbone. + vision_use_cls_token (`bool`, *optional*, defaults to `True`): + Whether CLS token is used in the vision backbone. If used, we remove CLS token embedding from vision output. projector_pooling_ratio (`int`, *optional*, defaults to 1): The pooling ratio used in the multimodal projector. image_token_id (`int`, *optional*, defaults to 128002): diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 0252fcc2f0c3..0709cd79772f 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -213,37 +213,35 @@ def forward( **lm_kwargs, ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of input sequence tokens in the vocabulary. - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): - Pixel values for input images. - pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): - Pixel values for input videos. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence token in the position embeddings. 
- past_key_values (`List[torch.FloatTensor]`, *optional*): - List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. - use_cache (`bool`, *optional*): - Whether or not to use past key values to speed up decoding. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor`, *optional*): - Position indices for cached key/value states, used for efficient generation. - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the - sequence length dimension. - lm_kwargs: - Additional keyword arguments passed to the language model. + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): + Pixel values for input images. + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): + Pixel values for input videos. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence token in the position embeddings. + past_key_values (`List[torch.FloatTensor]`, *optional*): + List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. + use_cache (`bool`, *optional*): + Whether or not to use past key values to speed up decoding. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + cache_position (`torch.LongTensor`, *optional*): + Position indices for cached key/value states, used for efficient generation. + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the + sequence length dimension. + lm_kwargs: + Additional keyword arguments passed to the language model. + Example: (TODO: fix example) ```python From 508117d7c7521b107fff22b3f58a50ef3b49d3ec Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 25 Jun 2025 21:14:46 +0000 Subject: [PATCH 54/65] Fix modular/model consistency.Improvex docstringfor . 
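This patch also extends the `_find_supported_aspect_ratios` docstring in `image_processing_perception_lm_fast.py`, which enumerates every tile layout allowed for a fixed tile budget and notes that later entries win ties in `_fit_image_to_canvas`. The snippet below sketches only the enumeration idea; the function name, signature, and dict-of-factor-pairs return layout are assumptions made for illustration, not the processor's actual implementation.

```python
def supported_tile_layouts(max_num_tiles: int) -> dict[int, list[tuple[int, int]]]:
    """List every (columns, rows) tiling whose tile count is at most `max_num_tiles`."""
    layouts: dict[int, list[tuple[int, int]]] = {}
    for n in range(1, max_num_tiles + 1):
        # every factor pair (w, h) with w * h == n tiles
        layouts[n] = [(w, n // w) for w in range(1, n + 1) if n % w == 0]
    return layouts


print(supported_tile_layouts(5))
# {1: [(1, 1)], 2: [(1, 2), (2, 1)], 3: [(1, 3), (3, 1)],
#  4: [(1, 4), (2, 2), (4, 1)], 5: [(1, 5), (5, 1)]}
```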
--- .../image_processing_perception_lm_fast.py | 3 +- .../perception_lm/modeling_perception_lm.py | 2 +- .../perception_lm/modular_perception_lm.py | 60 +++++++++---------- .../models/t5gemma/modeling_t5gemma.py | 11 +++- 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 9e669df08fe1..83b670cdbffe 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -86,7 +86,8 @@ def _factors(n: int): def _find_supported_aspect_ratios(self): """ This function computes all the allowed aspect ratios for a fixed - number of input chunks. + number of input chunks. The order of returned items matters for the result of `_fit_image_to_canvas` function. + If tie exists in `_fit_image_to_canvas`, the latter in `_find_supported_aspect_ratios` wins. For example, with `num_tiles=5`, it will return: { diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 0709cd79772f..fede197af670 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -29,9 +29,9 @@ from transformers.generation.utils import GenerationMixin from ...modeling_outputs import ModelOutput +from ...modeling_utils import PreTrainedModel # from ...generation import GenerationMixin -from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, can_return_tuple from ..auto import AutoModel from .configuration_perception_lm import PerceptionLMConfig diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 301b932e4b02..ed342f23a50c 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -159,37 +159,35 @@ def forward( **lm_kwargs, ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of input sequence tokens in the vocabulary. - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): - Pixel values for input images. - pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): - Pixel values for input videos. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence token in the position embeddings. - past_key_values (`List[torch.FloatTensor]`, *optional*): - List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. - use_cache (`bool`, *optional*): - Whether or not to use past key values to speed up decoding. 
- output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor`, *optional*): - Position indices for cached key/value states, used for efficient generation. - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the - sequence length dimension. - lm_kwargs: - Additional keyword arguments passed to the language model. + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): + Pixel values for input images. + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): + Pixel values for input videos. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence token in the position embeddings. + past_key_values (`List[torch.FloatTensor]`, *optional*): + List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. + use_cache (`bool`, *optional*): + Whether or not to use past key values to speed up decoding. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + cache_position (`torch.LongTensor`, *optional*): + Position indices for cached key/value states, used for efficient generation. + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the + sequence length dimension. + lm_kwargs: + Additional keyword arguments passed to the language model. + Example: (TODO: fix example) ```python diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py index feccf6d7d9fd..b7395cc548bc 100644 --- a/src/transformers/models/t5gemma/modeling_t5gemma.py +++ b/src/transformers/models/t5gemma/modeling_t5gemma.py @@ -1082,6 +1082,10 @@ def forward( output_hidden_states: Optional[bool] = None, **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> BaseModelOutput: + r""" + **flash_attn_kwargs: flash attention related parameters. 
+ """ + encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, @@ -1156,6 +1160,7 @@ def forward( decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored @@ -1227,7 +1232,7 @@ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): @auto_docstring class T5GemmaForSequenceClassification(T5GemmaPreTrainedModel): def __init__(self, config: T5GemmaConfig, is_encoder_decoder: Optional[bool] = None): - r""" + """ is_encoder_decoder (`Optional`, *optional*): Whether use encoder_decoder for sequence classification. When set to False, only encoder is used. """ @@ -1279,6 +1284,7 @@ def forward( decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1374,7 +1380,7 @@ def forward( @auto_docstring class T5GemmaForTokenClassification(T5GemmaPreTrainedModel): def __init__(self, config: T5GemmaConfig, is_encoder_decoder: Optional[bool] = None): - r""" + """ is_encoder_decoder (`Optional`, *optional*): Whether use encoder_decoder for token classification. When set to False, only encoder is used. """ @@ -1427,6 +1433,7 @@ def forward( decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If From bcedcc059692785662f52cb525a9a8dee68f42f5 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Thu, 26 Jun 2025 05:15:11 +0000 Subject: [PATCH 55/65] Fix PerceptionLMForConditionalGenerationModelTest --- src/transformers/models/__init__.py | 1 + .../configuration_perception_lm.py | 1 + .../convert_perception_lm_weights_to_hf.py | 8 +- .../perception_lm/modeling_perception_lm.py | 67 ++++++-- .../perception_lm/modular_perception_lm.py | 116 ++++++++++--- .../test_modeling_perception_lm.py | 152 +++++++++++++----- 6 files changed, 269 insertions(+), 76 deletions(-) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 7b2332d89f40..b786170ef205 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -234,6 +234,7 @@ from .pegasus import * from .pegasus_x import * from .perceiver import * + from .perception_lm import * from .persimmon import * from .phi import * from .phi3 import * diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index a6df856ae149..7c4634dfacff 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -86,6 +86,7 @@ def __init__( vision_config = vision_config elif vision_config is None: vision_config = TimmWrapperConfig() + vision_config._attn_implementation_autoset = True self.vision_config = vision_config self.vision_use_cls_token = vision_use_cls_token diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index c241f6a3beb9..3299a26a42e7 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -17,6 +17,7 @@ import os import tempfile import warnings +from typing import List import torch from timm.models.eva import checkpoint_filter_fn @@ -388,9 +389,10 @@ def permute(w, n_heads, dim1=dim, dim2=dim): config = PerceptionLMConfig( text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), - vision_tower_config=vision_params["use_cls_token"], projector_pooling_ratio=projector_pooling_ratio, - image_token_id=image_token_id, + vision_use_cls_token=vision_params["use_cls_token"], + image_token_id=tokenizer.image_token_id, + video_token_id=tokenizer.video_token_id, ) config.save_pretrained(tmp_model_path) @@ -580,7 +582,7 @@ def main(): parser.add_argument( "--special_tokens", default=None, - type=list[str], + type=List[str], help="The list of special tokens that should be added to the model.", ) args = parser.parse_args() diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index fede197af670..f29b42b34a77 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -28,10 +28,8 @@ from transformers.generation.utils import GenerationMixin -from ...modeling_outputs import ModelOutput +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput from ...modeling_utils import PreTrainedModel - -# from ...generation import GenerationMixin from ...utils import auto_docstring, can_return_tuple from 
..auto import AutoModel from .configuration_perception_lm import PerceptionLMConfig @@ -118,6 +116,29 @@ def _init_weights(self, module): module.bias.data.zero_() +@dataclass +@auto_docstring( + custom_intro=""" + Base class for PerceptionLM outputs, with hidden states and attentions. + """ +) +class PerceptionLMModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + video_hidden_states: Optional[torch.FloatTensor] = None + + @dataclass @auto_docstring( custom_intro=""" @@ -147,6 +168,7 @@ class PerceptionLMCausalLMOutputWithPast(ModelOutput): hidden_states: Optional[tuple[torch.FloatTensor]] = None attentions: Optional[tuple[torch.FloatTensor]] = None image_hidden_states: Optional[torch.FloatTensor] = None + video_hidden_states: Optional[torch.FloatTensor] = None @auto_docstring @@ -211,7 +233,7 @@ def forward( cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, - ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: + ) -> Union[tuple, PerceptionLMModelOutputWithPast]: r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary. 
@@ -290,6 +312,7 @@ def forward( image_features = image_features.to(inputs_embeds) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + video_features = None if pixel_values_videos is not None: video_features = self.get_image_features( pixel_values=pixel_values_videos.to(inputs_embeds), @@ -308,12 +331,19 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, cache_position=cache_position, logits_to_keep=logits_to_keep, **lm_kwargs, ) - return outputs, image_features + return PerceptionLMModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + past_key_values=outputs.past_key_values, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + video_hidden_states=(video_features if pixel_values_videos is not None else None), + ) def check_mask_feature_size_match(self, media_mask, media_features): media_token_count = media_mask.sum() @@ -334,6 +364,22 @@ def __init__(self, config: PerceptionLMConfig, **super_kwargs): self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) self.post_init() + # Make modules available throught conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + def set_input_embeddings(self, new_embeddings): + self.model.set_input_embeddings(new_embeddings) + def get_input_embeddings(self): return self.model.get_input_embeddings() @@ -380,6 +426,8 @@ def prepare_inputs_for_generation( model_inputs["pixel_values_videos"] = pixel_values_videos return model_inputs + @can_return_tuple + @auto_docstring def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -427,7 +475,7 @@ def forward( "USER: \nWhat's the content of the image? 
ASSISTANT: The image features a busy city street with a stop sign prominently displayed" ```""" - outputs, image_features = self.model( + outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, pixel_values_videos=pixel_values_videos, @@ -464,8 +512,9 @@ def forward( past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - image_hidden_states=image_features if pixel_values is not None else None, + image_hidden_states=outputs.image_hidden_states, + video_hidden_states=outputs.video_hidden_states, ) -__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel"] +__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel", "PerceptionLMModel"] diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index ed342f23a50c..55d2a877c8f2 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -23,18 +23,17 @@ from transformers.generation.utils import GenerationMixin -# from ...generation import GenerationMixin from ...utils import ( auto_docstring, + can_return_tuple, logging, ) from ..auto import AutoModel -from ..llava.modeling_llava import ( - LlavaCausalLMOutputWithPast as PerceptionLMCausalLMOutputWithPast, -) from ..llava.modeling_llava import ( LlavaModel, LlavaPreTrainedModel, + LlavaModelOutputWithPast, + LlavaCausalLMOutputWithPast, ) from .configuration_perception_lm import PerceptionLMConfig @@ -56,7 +55,9 @@ def forward(self, hidden_states): b, num_tokens, c = hidden_states.shape h = int(math.sqrt(num_tokens)) if h * h != num_tokens: - raise ValueError(f"num_tokens {num_tokens} is expected to be a square number") + raise ValueError( + f"num_tokens {num_tokens} is expected to be a square number" + ) shape = (h // self.pooling_ratio, h // self.pooling_ratio) hidden_states = hidden_states.permute(0, 2, 1).reshape(b, -1, h, h) @@ -87,7 +88,9 @@ def __init__(self, config: PerceptionLMConfig): ] ) self.pooling = ( - AdaptiveAvgPooling(config.projector_pooling_ratio) if config.projector_pooling_ratio > 1 else nn.Identity() + AdaptiveAvgPooling(config.projector_pooling_ratio) + if config.projector_pooling_ratio > 1 + else nn.Identity() ) def forward(self, features): @@ -102,6 +105,13 @@ def forward(self, features): class PerceptionLMPreTrainedModel(LlavaPreTrainedModel): base_model_prefix = "" +class PerceptionLMModelOutputWithPast(LlavaModelOutputWithPast): + video_hidden_states: Optional[torch.FloatTensor] = None + + +class PerceptionLMCausalLMOutputWithPast(LlavaCausalLMOutputWithPast): + video_hidden_states: Optional[torch.FloatTensor] = None + @auto_docstring class PerceptionLMModel(LlavaModel): @@ -141,6 +151,8 @@ def check_mask_feature_size_match(self, media_mask, media_features): f"The number of tokens in the media mask ({media_token_count}) does not match the number of features in the media features ({media_feature_size}. 
Features shape: {media_features.shape})" ) + @can_return_tuple + @auto_docstring def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -157,7 +169,7 @@ def forward( cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, - ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: + ) -> Union[tuple, PerceptionLMModelOutputWithPast]: r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary. @@ -210,14 +222,26 @@ def forward( "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: + raise ValueError( + "You must specify exactly one of input_ids or inputs_embeds" + ) + if ( + pixel_values is not None or pixel_values_videos is not None + ) and inputs_embeds is not None: raise ValueError( "You cannot specify both (pixel_values or pixel_values_videos) and inputs_embeds at the same time, and must specify either one" ) @@ -232,19 +256,28 @@ def forward( ) special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) self.check_mask_feature_size_match(special_image_mask, image_features) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to( + inputs_embeds.device + ) image_features = image_features.to(inputs_embeds) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + inputs_embeds = inputs_embeds.masked_scatter( + special_image_mask, image_features + ) + video_features = None if pixel_values_videos is not None: video_features = self.get_image_features( pixel_values=pixel_values_videos.to(inputs_embeds), ) special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1) self.check_mask_feature_size_match(special_video_mask, video_features) - special_video_mask = special_video_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + special_video_mask = special_video_mask.expand_as(inputs_embeds).to( + inputs_embeds.device + ) video_features = video_features.to(inputs_embeds) - inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features) + inputs_embeds = inputs_embeds.masked_scatter( + special_video_mask, video_features + ) outputs = self.language_model( attention_mask=attention_mask, @@ -254,24 +287,53 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, cache_position=cache_position, logits_to_keep=logits_to_keep, 
**lm_kwargs, ) - return outputs, image_features + return PerceptionLMModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + past_key_values=outputs.past_key_values, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + video_hidden_states=( + video_features if pixel_values_videos is not None else None + ), + ) @auto_docstring -class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): +class PerceptionLMForConditionalGeneration( + PerceptionLMPreTrainedModel, GenerationMixin +): _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: PerceptionLMConfig, **super_kwargs): super().__init__(config, **super_kwargs) self.model = PerceptionLMModel(config) - self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.lm_head = nn.Linear( + config.text_config.hidden_size, config.text_config.vocab_size, bias=False + ) self.post_init() + # Make modules available throught conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + def set_input_embeddings(self, new_embeddings): + self.model.set_input_embeddings(new_embeddings) + def get_input_embeddings(self): return self.model.get_input_embeddings() @@ -318,6 +380,8 @@ def prepare_inputs_for_generation( model_inputs["pixel_values_videos"] = pixel_values_videos return model_inputs + @can_return_tuple + @auto_docstring def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -365,7 +429,7 @@ def forward( "USER: \nWhat's the content of the image? 
ASSISTANT: The image features a busy city street with a stop sign prominently displayed" ```""" - outputs, image_features = self.model( + outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, pixel_values_videos=pixel_values_videos, @@ -384,7 +448,11 @@ def forward( hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + slice_indices = ( + slice(-logits_to_keep, None) + if isinstance(logits_to_keep, int) + else logits_to_keep + ) logits = self.lm_head(hidden_states[:, slice_indices, :]) loss = None @@ -402,11 +470,13 @@ def forward( past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - image_hidden_states=image_features if pixel_values is not None else None, + image_hidden_states=outputs.image_hidden_states, + video_hidden_states=outputs.video_hidden_states, ) __all__ = [ "PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel", + "PerceptionLMModel", ] diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index fc3468307451..689a92feb1fd 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -21,6 +21,7 @@ AutoProcessor, PerceptionLMConfig, PerceptionLMForConditionalGeneration, + PerceptionLMModel, is_torch_available, ) from transformers.testing_utils import ( @@ -74,16 +75,16 @@ def __init__( }, is_training=True, vision_config={ - "use_cls_token": True, "architecture": "vit_pe_core_large_patch14_336", - "width": 64, - "img_size": (14, 14), - "depth": 2, - "num_classes": 0, - "global_pool": "", - "use_post_transformer_norm": False, - "init_values": 0.1, - "ref_feat_shape": (1, 1), + "model_args": { + "embed_dim": 64, + "img_size": (14, 14), + "depth": 2, + "global_pool": "", + "use_post_transformer_norm": False, + "init_values": 0.1, + "ref_feat_shape": (1, 1), + }, }, ): self.parent = parent @@ -104,9 +105,13 @@ def __init__( self.num_tiles = 1 self.num_frames = 1 self.num_channels = 3 - self.image_size = self.vision_config["img_size"][0] - self.num_image_tokens = (self.vision_config["img_size"][0] // 14) ** 2 - self.num_video_tokens = (self.vision_config["img_size"][0] // 14) ** 2 + self.image_size = self.vision_config["model_args"]["img_size"][0] + self.num_image_tokens = ( + self.vision_config["model_args"]["img_size"][0] // 14 + ) ** 2 + self.num_video_tokens = ( + self.vision_config["model_args"]["img_size"][0] // 14 + ) ** 2 self.seq_length = seq_length + self.num_image_tokens self.encoder_seq_length = self.seq_length @@ -114,6 +119,7 @@ def get_config(self): return PerceptionLMConfig( text_config=self.text_config, vision_config=self.vision_config, + vision_use_cls_token=True, image_token_id=self.image_token_id, video_token_id=self.video_token_id, tie_word_embeddings=self.tie_word_embeddings, @@ -125,8 +131,8 @@ def prepare_config_and_inputs(self): self.batch_size, self.num_tiles, self.num_channels, - self.vision_config["img_size"][0], - self.vision_config["img_size"][1], + self.vision_config["model_args"]["img_size"][0], + self.vision_config["model_args"]["img_size"][1], ] ) pixel_values_videos = floats_tensor( @@ -134,8 +140,8 @@ def prepare_config_and_inputs(self): self.batch_size, self.num_frames, self.num_channels, - self.vision_config["img_size"][0], - 
self.vision_config["img_size"][1], + self.vision_config["model_args"]["img_size"][0], + self.vision_config["model_args"]["img_size"][1], ] ) config = self.get_config() @@ -144,12 +150,19 @@ def prepare_config_and_inputs(self): def prepare_config_and_inputs_for_common(self): config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs() - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 + input_ids = ( + ids_tensor( + [self.batch_size, self.seq_length], config.text_config.vocab_size - 2 + ) + + 2 + ) attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) input_ids[input_ids == config.image_token_id] = self.pad_token_id input_ids[input_ids == config.video_token_id] = self.pad_token_id input_ids[:, : self.num_image_tokens] = config.image_token_id - input_ids[:, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens] = config.video_token_id + input_ids[ + :, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens + ] = config.video_token_id inputs_dict = { "pixel_values": pixel_values, @@ -161,12 +174,21 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class PerceptionLMForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): +class PerceptionLMForConditionalGenerationModelTest( + ModelTesterMixin, GenerationTesterMixin, unittest.TestCase +): """ Model tester for `PerceptionLMForConditionalGeneration`. """ - all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () + all_model_classes = ( + ( + PerceptionLMModel, + PerceptionLMForConditionalGeneration, + ) + if is_torch_available() + else () + ) test_pruning = False test_head_masking = False _is_composite = True @@ -240,6 +262,8 @@ def test_mismatching_num_image_tokens(self): """ config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: + if model_class == PerceptionLMModel: + continue model = model_class(config).to(torch_device) _ = model(**input_dict) # successful forward with no modifications @@ -261,22 +285,62 @@ def test_mismatching_num_image_tokens(self): pixel_values = torch.cat([pixel_values, pixel_values], dim=0) _ = model(input_ids=input_ids, pixel_values=pixel_values) + def test_training(self): + self.all_model_classes = ( + ( + PerceptionLMForConditionalGeneration, + ) + if is_torch_available() + else () + ) + super().test_training() + + def test_training_gradient_checkpointing(self): + self.all_model_classes = ( + ( + PerceptionLMForConditionalGeneration, + ) + if is_torch_available() + else () + ) + super().test_training_gradient_checkpointing() + + def test_training_gradient_checkpointing_use_reentrant(self): + self.all_model_classes = ( + ( + PerceptionLMForConditionalGeneration, + ) + if is_torch_available() + else () + ) + super().test_training_gradient_checkpointing_use_reentrant() + + def test_training_gradient_checkpointing_use_reentrant_false(self): + self.all_model_classes = ( + ( + PerceptionLMForConditionalGeneration, + ) + if is_torch_available() + else () + ) + super().test_training_gradient_checkpointing_use_reentrant_false() + @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights" ) - def test_training_gradient_checkpointing(self): + def 
test_can_init_all_missing_weights(self): pass @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights" ) - def test_training_gradient_checkpointing_use_reentrant(self): + def test_initialization(self): pass @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + reason="PE/TIMM's attention implementation is self configured and won't raise ValueError on global attention implementation." ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + def test_flash_attn_2_can_dispatch_composite_models(self): pass @unittest.skip( @@ -293,26 +357,24 @@ def test_can_be_initialized_on_meta(self): def test_can_load_with_meta_device_context_manager(self): pass - @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM") + @unittest.skip( + "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" + ) def test_generate_from_inputs_embeds_0_greedy(self): pass - @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM") + @unittest.skip( + "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" + ) def test_generate_from_inputs_embeds_1_beam_search(self): pass - @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM") + @unittest.skip( + "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" + ) def test_generate_from_inputs_embeds_with_static_cache(self): pass - @unittest.skip("We don't support initializing all missing weights, only finetuning is supported.") - def test_can_init_all_missing_weights(self): - pass - - @unittest.skip("We don't support initializing all missing weights, only finetuning is supported.") - def test_initialization(self): - pass - TEST_MODEL_PATH = "shumingh/plm_1b_hf" @@ -382,14 +444,18 @@ def test_small_model_integration_test(self): EXPECTED_DECODED_TEXT = "The bar plot displays the values of four categories: step, horror, mood, and lumber" # fmt: skip self.assertEqual( - self.processor.decode(generate_ids_without_inputs[0], skip_special_tokens=True), + self.processor.decode( + generate_ids_without_inputs[0], skip_special_tokens=True + ), EXPECTED_DECODED_TEXT, ) @slow @require_bitsandbytes def test_small_model_integration_test_batched(self): - model = PerceptionLMForConditionalGeneration.from_pretrained(TEST_MODEL_PATH, load_in_4bit=True) + model = PerceptionLMForConditionalGeneration.from_pretrained( + TEST_MODEL_PATH, load_in_4bit=True + ) processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH) inputs = processor.apply_chat_template( [self.conversation1, self.conversation2], @@ -410,7 +476,9 @@ def test_small_model_integration_test_batched(self): EXPECTED_DECODED_TEXT = ['The bar plot displays the values of four categories: step, horror, mood, and lumber', 'The video shows a group of people in green shirts and white shorts performing a jump rope routine'] # fmt: skip self.assertEqual( - processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True), + processor.batch_decode( + generate_ids_without_inputs, skip_special_tokens=True + ), EXPECTED_DECODED_TEXT, ) @@ -418,7 +486,9 @@ def test_small_model_integration_test_batched(self): @require_bitsandbytes 
def test_generation_no_images(self): # model_id = "facebook/Perception-LM-1B" - model = PerceptionLMForConditionalGeneration.from_pretrained(TEST_MODEL_PATH, load_in_4bit=True) + model = PerceptionLMForConditionalGeneration.from_pretrained( + TEST_MODEL_PATH, load_in_4bit=True + ) processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH) # Prepare inputs with no images From f25a2ca177325f731926cc1ebfb7a7592f9ddcd2 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Thu, 26 Jun 2025 05:20:21 +0000 Subject: [PATCH 56/65] ruff fix --- .../convert_perception_lm_weights_to_hf.py | 3 +- .../perception_lm/modular_perception_lm.py | 69 ++++---------- .../test_modeling_perception_lm.py | 91 ++++--------------- 3 files changed, 37 insertions(+), 126 deletions(-) diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 3299a26a42e7..6d87b758c31d 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -17,7 +17,6 @@ import os import tempfile import warnings -from typing import List import torch from timm.models.eva import checkpoint_filter_fn @@ -582,7 +581,7 @@ def main(): parser.add_argument( "--special_tokens", default=None, - type=List[str], + type=list[str], help="The list of special tokens that should be added to the model.", ) args = parser.parse_args() diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 55d2a877c8f2..41a811277ed1 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -30,10 +30,10 @@ ) from ..auto import AutoModel from ..llava.modeling_llava import ( + LlavaCausalLMOutputWithPast, LlavaModel, - LlavaPreTrainedModel, LlavaModelOutputWithPast, - LlavaCausalLMOutputWithPast, + LlavaPreTrainedModel, ) from .configuration_perception_lm import PerceptionLMConfig @@ -55,9 +55,7 @@ def forward(self, hidden_states): b, num_tokens, c = hidden_states.shape h = int(math.sqrt(num_tokens)) if h * h != num_tokens: - raise ValueError( - f"num_tokens {num_tokens} is expected to be a square number" - ) + raise ValueError(f"num_tokens {num_tokens} is expected to be a square number") shape = (h // self.pooling_ratio, h // self.pooling_ratio) hidden_states = hidden_states.permute(0, 2, 1).reshape(b, -1, h, h) @@ -88,9 +86,7 @@ def __init__(self, config: PerceptionLMConfig): ] ) self.pooling = ( - AdaptiveAvgPooling(config.projector_pooling_ratio) - if config.projector_pooling_ratio > 1 - else nn.Identity() + AdaptiveAvgPooling(config.projector_pooling_ratio) if config.projector_pooling_ratio > 1 else nn.Identity() ) def forward(self, features): @@ -105,6 +101,7 @@ def forward(self, features): class PerceptionLMPreTrainedModel(LlavaPreTrainedModel): base_model_prefix = "" + class PerceptionLMModelOutputWithPast(LlavaModelOutputWithPast): video_hidden_states: Optional[torch.FloatTensor] = None @@ -222,26 +219,14 @@ def forward( "USER: \nWhat's the content of the image? 
ASSISTANT: The image features a busy city street with a stop sign prominently displayed" ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You must specify exactly one of input_ids or inputs_embeds" - ) - if ( - pixel_values is not None or pixel_values_videos is not None - ) and inputs_embeds is not None: + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( "You cannot specify both (pixel_values or pixel_values_videos) and inputs_embeds at the same time, and must specify either one" ) @@ -256,13 +241,9 @@ def forward( ) special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) self.check_mask_feature_size_match(special_image_mask, image_features) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to( - inputs_embeds.device - ) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) image_features = image_features.to(inputs_embeds) - inputs_embeds = inputs_embeds.masked_scatter( - special_image_mask, image_features - ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) video_features = None if pixel_values_videos is not None: @@ -271,13 +252,9 @@ def forward( ) special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1) self.check_mask_feature_size_match(special_video_mask, video_features) - special_video_mask = special_video_mask.expand_as(inputs_embeds).to( - inputs_embeds.device - ) + special_video_mask = special_video_mask.expand_as(inputs_embeds).to(inputs_embeds.device) video_features = video_features.to(inputs_embeds) - inputs_embeds = inputs_embeds.masked_scatter( - special_video_mask, video_features - ) + inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features) outputs = self.language_model( attention_mask=attention_mask, @@ -298,24 +275,18 @@ def forward( past_key_values=outputs.past_key_values, attentions=outputs.attentions, image_hidden_states=image_features if pixel_values is not None else None, - video_hidden_states=( - video_features if pixel_values_videos is not None else None - ), + video_hidden_states=(video_features if pixel_values_videos is not None else None), ) @auto_docstring -class PerceptionLMForConditionalGeneration( - PerceptionLMPreTrainedModel, GenerationMixin -): +class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: PerceptionLMConfig, **super_kwargs): super().__init__(config, **super_kwargs) self.model = PerceptionLMModel(config) - self.lm_head = nn.Linear( - config.text_config.hidden_size, config.text_config.vocab_size, bias=False - ) + self.lm_head = nn.Linear(config.text_config.hidden_size, 
config.text_config.vocab_size, bias=False) self.post_init() # Make modules available throught conditional class for BC @@ -448,11 +419,7 @@ def forward( hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = ( - slice(-logits_to_keep, None) - if isinstance(logits_to_keep, int) - else logits_to_keep - ) + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep logits = self.lm_head(hidden_states[:, slice_indices, :]) loss = None diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index 689a92feb1fd..1ca67a3073b4 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -106,12 +106,8 @@ def __init__( self.num_frames = 1 self.num_channels = 3 self.image_size = self.vision_config["model_args"]["img_size"][0] - self.num_image_tokens = ( - self.vision_config["model_args"]["img_size"][0] // 14 - ) ** 2 - self.num_video_tokens = ( - self.vision_config["model_args"]["img_size"][0] // 14 - ) ** 2 + self.num_image_tokens = (self.vision_config["model_args"]["img_size"][0] // 14) ** 2 + self.num_video_tokens = (self.vision_config["model_args"]["img_size"][0] // 14) ** 2 self.seq_length = seq_length + self.num_image_tokens self.encoder_seq_length = self.seq_length @@ -150,19 +146,12 @@ def prepare_config_and_inputs(self): def prepare_config_and_inputs_for_common(self): config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs() - input_ids = ( - ids_tensor( - [self.batch_size, self.seq_length], config.text_config.vocab_size - 2 - ) - + 2 - ) + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) input_ids[input_ids == config.image_token_id] = self.pad_token_id input_ids[input_ids == config.video_token_id] = self.pad_token_id input_ids[:, : self.num_image_tokens] = config.image_token_id - input_ids[ - :, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens - ] = config.video_token_id + input_ids[:, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens] = config.video_token_id inputs_dict = { "pixel_values": pixel_values, @@ -174,9 +163,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class PerceptionLMForConditionalGenerationModelTest( - ModelTesterMixin, GenerationTesterMixin, unittest.TestCase -): +class PerceptionLMForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): """ Model tester for `PerceptionLMForConditionalGeneration`. 
""" @@ -286,54 +273,26 @@ def test_mismatching_num_image_tokens(self): _ = model(input_ids=input_ids, pixel_values=pixel_values) def test_training(self): - self.all_model_classes = ( - ( - PerceptionLMForConditionalGeneration, - ) - if is_torch_available() - else () - ) + self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () super().test_training() def test_training_gradient_checkpointing(self): - self.all_model_classes = ( - ( - PerceptionLMForConditionalGeneration, - ) - if is_torch_available() - else () - ) + self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () super().test_training_gradient_checkpointing() def test_training_gradient_checkpointing_use_reentrant(self): - self.all_model_classes = ( - ( - PerceptionLMForConditionalGeneration, - ) - if is_torch_available() - else () - ) + self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () super().test_training_gradient_checkpointing_use_reentrant() def test_training_gradient_checkpointing_use_reentrant_false(self): - self.all_model_classes = ( - ( - PerceptionLMForConditionalGeneration, - ) - if is_torch_available() - else () - ) + self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () super().test_training_gradient_checkpointing_use_reentrant_false() - @unittest.skip( - reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights" - ) + @unittest.skip(reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights") def test_can_init_all_missing_weights(self): pass - @unittest.skip( - reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights" - ) + @unittest.skip(reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights") def test_initialization(self): pass @@ -357,21 +316,15 @@ def test_can_be_initialized_on_meta(self): def test_can_load_with_meta_device_context_manager(self): pass - @unittest.skip( - "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" - ) + @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM") def test_generate_from_inputs_embeds_0_greedy(self): pass - @unittest.skip( - "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" - ) + @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM") def test_generate_from_inputs_embeds_1_beam_search(self): pass - @unittest.skip( - "Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM" - ) + @unittest.skip("Specifying both inputs_embeds and pixel_values are not supported for PerceptionLM") def test_generate_from_inputs_embeds_with_static_cache(self): pass @@ -444,18 +397,14 @@ def test_small_model_integration_test(self): EXPECTED_DECODED_TEXT = "The bar plot displays the values of four categories: step, horror, mood, and lumber" # fmt: skip self.assertEqual( - self.processor.decode( - generate_ids_without_inputs[0], skip_special_tokens=True - ), + self.processor.decode(generate_ids_without_inputs[0], skip_special_tokens=True), EXPECTED_DECODED_TEXT, ) @slow @require_bitsandbytes def test_small_model_integration_test_batched(self): - model = PerceptionLMForConditionalGeneration.from_pretrained( - TEST_MODEL_PATH, load_in_4bit=True - ) + model = PerceptionLMForConditionalGeneration.from_pretrained(TEST_MODEL_PATH, load_in_4bit=True) processor = 
AutoProcessor.from_pretrained(TEST_MODEL_PATH) inputs = processor.apply_chat_template( [self.conversation1, self.conversation2], @@ -476,9 +425,7 @@ def test_small_model_integration_test_batched(self): EXPECTED_DECODED_TEXT = ['The bar plot displays the values of four categories: step, horror, mood, and lumber', 'The video shows a group of people in green shirts and white shorts performing a jump rope routine'] # fmt: skip self.assertEqual( - processor.batch_decode( - generate_ids_without_inputs, skip_special_tokens=True - ), + processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True), EXPECTED_DECODED_TEXT, ) @@ -486,9 +433,7 @@ def test_small_model_integration_test_batched(self): @require_bitsandbytes def test_generation_no_images(self): # model_id = "facebook/Perception-LM-1B" - model = PerceptionLMForConditionalGeneration.from_pretrained( - TEST_MODEL_PATH, load_in_4bit=True - ) + model = PerceptionLMForConditionalGeneration.from_pretrained(TEST_MODEL_PATH, load_in_4bit=True) processor = AutoProcessor.from_pretrained(TEST_MODEL_PATH) # Prepare inputs with no images From e8d08e803775eb60e5d6b9a2122e49345ca49e6d Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 27 Jun 2025 20:32:20 +0000 Subject: [PATCH 57/65] fix for check_repo --- docs/source/en/model_doc/perception_lm.md | 12 ++++++++ .../perception_lm/modeling_perception_lm.py | 29 ------------------- .../perception_lm/modular_perception_lm.py | 29 ------------------- utils/check_repo.py | 1 + 4 files changed, 13 insertions(+), 58 deletions(-) diff --git a/docs/source/en/model_doc/perception_lm.md b/docs/source/en/model_doc/perception_lm.md index 172bd68199a0..3982d521b949 100644 --- a/docs/source/en/model_doc/perception_lm.md +++ b/docs/source/en/model_doc/perception_lm.md @@ -50,6 +50,18 @@ The original code can be found [here](https://github.com/facebookresearch/percep [[autodoc]] PerceptionLMProcessor +## PerceptionLMImageProcessorFast + +[[autodoc]] PerceptionLMImageProcessorFast + +## PerceptionLMVideoProcessor + +[[autodoc]] PerceptionLMVideoProcessor + +## PerceptionLMModel + +[[autodoc]] PerceptionLMModel + ## PerceptionLMForConditionalGeneration [[autodoc]] PerceptionLMForConditionalGeneration diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index f29b42b34a77..20321a303648 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -446,35 +446,6 @@ def forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. 
- If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - Returns: - Example: - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration - >>> model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B") - >>> processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B") - >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" - >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - >>> # Generate - >>> generate_ids = model.generate(**inputs, max_new_tokens=15) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" - ```""" - outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 41a811277ed1..0ec2b7eb28ff 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -371,35 +371,6 @@ def forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - Returns: - Example: - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration - >>> model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B") - >>> processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B") - >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" - >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - >>> # Generate - >>> generate_ids = model.generate(**inputs, max_new_tokens=15) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "USER: \nWhat's the content of the image? 
ASSISTANT: The image features a busy city street with a stop sign prominently displayed" - ```""" - outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, diff --git a/utils/check_repo.py b/utils/check_repo.py index 0487e1def262..341dde48f0d6 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -301,6 +301,7 @@ "OwlViTForObjectDetection", "PatchTSMixerForPrediction", "PatchTSMixerForPretraining", + "PerceptionLMModel", "RagModel", "RagSequenceForGeneration", "RagTokenForGeneration", From 4c05fb3c97eb7cc7f6a0c5be7132ad489e278841 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 27 Jun 2025 21:36:01 +0000 Subject: [PATCH 58/65] minor formatting --- .../perception_lm/modeling_perception_lm.py | 23 +------------------ .../perception_lm/modular_perception_lm.py | 23 +------------------ .../models/t5gemma/modeling_t5gemma.py | 11 ++------- 3 files changed, 4 insertions(+), 53 deletions(-) diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 20321a303648..d313c7259c18 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -263,28 +263,7 @@ def forward( sequence length dimension. lm_kwargs: Additional keyword arguments passed to the language model. - - Example: - (TODO: fix example) - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration - - >>> model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B") - >>> processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B") - - >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" - >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(**inputs, max_new_tokens=15) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" - ```""" + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 0ec2b7eb28ff..6cd9f5b27842 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -196,28 +196,7 @@ def forward( sequence length dimension. lm_kwargs: Additional keyword arguments passed to the language model. - - Example: - (TODO: fix example) - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration - - >>> model = PerceptionLMForConditionalGeneration.from_pretrained("facebook/Perception-LM-1B") - >>> processor = AutoProcessor.from_pretrained("facebook/Perception-LM-1B") - - >>> prompt = "USER: \nWhat's the content of the image? 
ASSISTANT:" - >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(**inputs, max_new_tokens=15) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" - ```""" + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py index b7395cc548bc..feccf6d7d9fd 100644 --- a/src/transformers/models/t5gemma/modeling_t5gemma.py +++ b/src/transformers/models/t5gemma/modeling_t5gemma.py @@ -1082,10 +1082,6 @@ def forward( output_hidden_states: Optional[bool] = None, **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> BaseModelOutput: - r""" - **flash_attn_kwargs: flash attention related parameters. - """ - encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, @@ -1160,7 +1156,6 @@ def forward( decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored @@ -1232,7 +1227,7 @@ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): @auto_docstring class T5GemmaForSequenceClassification(T5GemmaPreTrainedModel): def __init__(self, config: T5GemmaConfig, is_encoder_decoder: Optional[bool] = None): - """ + r""" is_encoder_decoder (`Optional`, *optional*): Whether use encoder_decoder for sequence classification. When set to False, only encoder is used. """ @@ -1284,7 +1279,6 @@ def forward( decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1380,7 +1374,7 @@ def forward( @auto_docstring class T5GemmaForTokenClassification(T5GemmaPreTrainedModel): def __init__(self, config: T5GemmaConfig, is_encoder_decoder: Optional[bool] = None): - """ + r""" is_encoder_decoder (`Optional`, *optional*): Whether use encoder_decoder for token classification. When set to False, only encoder is used. """ @@ -1433,7 +1427,6 @@ def forward( decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. 
Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If From c74d6529fa59e0df14ee2440f6de2107446c9f91 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 27 Jun 2025 21:46:22 +0000 Subject: [PATCH 59/65] dummy size arg to fix for processor test. --- .../models/perception_lm/image_processing_perception_lm_fast.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 83b670cdbffe..d3db3782eacd 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -68,6 +68,7 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True + size = {"width": 448, "height": 448} # for backward compatibility in tests valid_kwargs = PerceptionLMFastImageProcessorKwargs def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None: From 3c46d3a286bb223b7ca96f983d57e507db1dc899 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Fri, 27 Jun 2025 22:05:46 +0000 Subject: [PATCH 60/65] Update docstring for PerceptionLMConfig --- .../models/perception_lm/configuration_perception_lm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 7c4634dfacff..70014ec75bc3 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -27,6 +27,11 @@ class PerceptionLMConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`PerceptionLMForConditionalGeneration`]. It is used to instantiate an PerceptionLM model according to the specified arguments, defining the model architecture. + Example models: + - [facebook/Perception-LM-1B](https://huggingface.co/facebook/Perception-LM-1B). + - [facebook/Perception-LM-3B](https://huggingface.co/facebook/Perception-LM-3B). + - [facebook/Perception-LM-8B](https://huggingface.co/facebook/Perception-LM-8B). + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. From 160b039af70130a9b44ef12fe5f638b925f10830 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 2 Jul 2025 00:14:23 +0000 Subject: [PATCH 61/65] Minor fixes from review feedback. 
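
Not part of the diff, just a reviewer note: with this change `_preprocess` forwards `disable_grouping` instead of hardcoding `False`. A minimal sketch of how that flag would be exercised, assuming the released checkpoint id and that the fast processor exposes the flag through its call kwargs like other fast image processors:

```python
# Sketch only: "facebook/Perception-LM-1B" and the public `disable_grouping` kwarg are assumptions.
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("facebook/Perception-LM-1B", use_fast=True)
images = [Image.new("RGB", (448, 448)), Image.new("RGB", (448, 448))]

# disable_grouping=True skips the batch-by-shape optimization and processes images one by one,
# which can help when debugging per-image tiling; results should match the grouped path.
batch = processor(images=images, disable_grouping=True, return_tensors="pt")
print(batch["pixel_values"].shape)  # (batch, num_tiles, channels, tile_size, tile_size)
```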
--- src/transformers/models/auto/modeling_auto.py | 1 + .../configuration_perception_lm.py | 22 +------------ .../image_processing_perception_lm_fast.py | 5 +-- .../perception_lm/modeling_perception_lm.py | 31 ------------------- .../perception_lm/modular_perception_lm.py | 31 ------------------- .../perception_lm/processing_perception_lm.py | 3 +- utils/check_repo.py | 1 - 7 files changed, 6 insertions(+), 88 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 0390720947db..7c2a7d130997 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -256,6 +256,7 @@ ("pegasus_x", "PegasusXModel"), ("perceiver", "PerceiverModel"), ("perception_encoder", "PerceptionEncoder"), + ("perception_lm", "PerceptionLMModel"), ("persimmon", "PersimmonModel"), ("phi", "PhiModel"), ("phi3", "Phi3Model"), diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index 70014ec75bc3..e9f7e77b3235 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -48,27 +48,7 @@ class PerceptionLMConfig(PretrainedConfig): The image token index to encode the image prompt. video_token_id (`int`, *optional*, defaults to 128003): The video token index to encode the video prompt. - - Example: - - ```python - >>> from transformers import PerceptionLMForConditionalGeneration, PerceptionLMConfig, TimmWrapperConfig, LlamaConfig - - >>> # Initializing a PerceptionEncoder config - >>> vision_config = TimmWrapperConfig() - - >>> # Initializing a Llama config - >>> text_config = LlamaConfig() - - >>> # Initializing a PerceptionLM perception_lm-1.5-7b style configuration - >>> configuration = PerceptionLMConfig(vision_config, text_config) - - >>> # Initializing a model from the perception_lm-1.5-7b style configuration - >>> model = PerceptionLMForConditionalGeneration(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" + """ model_type = "perception_lm" sub_configs = {"text_config": AutoConfig, "vision_config": TimmWrapperConfig} diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index d3db3782eacd..8a5fedc50b42 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -263,10 +263,11 @@ def _preprocess( tile_size: int, max_num_tiles: int, return_tensors: Optional[Union[str, TensorType]], + disable_grouping: bool, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs], ) -> BatchFeature: # Group images by size for batched transformation - grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=False) + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: @@ -283,7 +284,7 @@ def _preprocess( resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) - grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=False) + grouped_images, grouped_images_index = 
group_images_by_shape(resized_images, disable_grouping=disable_grouping) processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): # Fused rescale and normalize diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index d313c7259c18..b3cd3d425bdb 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -234,37 +234,6 @@ def forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ) -> Union[tuple, PerceptionLMModelOutputWithPast]: - r""" - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of input sequence tokens in the vocabulary. - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): - Pixel values for input images. - pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): - Pixel values for input videos. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence token in the position embeddings. - past_key_values (`List[torch.FloatTensor]`, *optional*): - List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. - use_cache (`bool`, *optional*): - Whether or not to use past key values to speed up decoding. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. - cache_position (`torch.LongTensor`, *optional*): - Position indices for cached key/value states, used for efficient generation. - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the - sequence length dimension. - lm_kwargs: - Additional keyword arguments passed to the language model. - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 6cd9f5b27842..2f7f9b1794ed 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -167,37 +167,6 @@ def forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ) -> Union[tuple, PerceptionLMModelOutputWithPast]: - r""" - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of input sequence tokens in the vocabulary. 
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_tiles, channels, height, width)`, *optional*): - Pixel values for input images. - pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`, *optional*): - Pixel values for input videos. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence token in the position embeddings. - past_key_values (`List[torch.FloatTensor]`, *optional*): - List of precomputed key and value hidden states for each layer, used for fast autoregressive generation. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. - use_cache (`bool`, *optional*): - Whether or not to use past key values to speed up decoding. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. - cache_position (`torch.LongTensor`, *optional*): - Position indices for cached key/value states, used for efficient generation. - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the - sequence length dimension. - lm_kwargs: - Additional keyword arguments passed to the language model. - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 351e4eb4cdea..7dc1dc1ea371 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -1,6 +1,5 @@ # coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. -# +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/utils/check_repo.py b/utils/check_repo.py index 341dde48f0d6..0487e1def262 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -301,7 +301,6 @@ "OwlViTForObjectDetection", "PatchTSMixerForPrediction", "PatchTSMixerForPretraining", - "PerceptionLMModel", "RagModel", "RagSequenceForGeneration", "RagTokenForGeneration", From 97fbfcabf7ceb1b27f9d655de293dccc0ff9a701 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 2 Jul 2025 16:12:58 +0000 Subject: [PATCH 62/65] Revert some minor changes per reviewer feedback. 
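
Not part of the diff: the flash-attention tests skipped below assume the attention backend is set per sub-model. A minimal sketch of the configuration named in the skip comments, with the checkpoint id as a placeholder:

```python
# Sketch of the mixed-backend setup referenced in the test skip comments; the model id is an assumption.
import torch
from transformers import PerceptionLMForConditionalGeneration

model = PerceptionLMForConditionalGeneration.from_pretrained(
    "facebook/Perception-LM-1B",
    torch_dtype=torch.bfloat16,  # flash attention needs fp16/bf16
    attn_implementation={"text_config": "flash_attention_2", "vision_config": "eager"},
)
```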
--- .../configuration_perception_lm.py | 1 - .../perception_lm/modular_perception_lm.py | 76 +++++++++++++++++++ .../test_modeling_perception_lm.py | 31 ++++++++ 3 files changed, 107 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/perception_lm/configuration_perception_lm.py b/src/transformers/models/perception_lm/configuration_perception_lm.py index e9f7e77b3235..12352967d7c7 100644 --- a/src/transformers/models/perception_lm/configuration_perception_lm.py +++ b/src/transformers/models/perception_lm/configuration_perception_lm.py @@ -71,7 +71,6 @@ def __init__( vision_config = vision_config elif vision_config is None: vision_config = TimmWrapperConfig() - vision_config._attn_implementation_autoset = True self.vision_config = vision_config self.vision_use_cls_token = vision_use_cls_token diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 2f7f9b1794ed..4e84eb0bece3 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -167,6 +167,43 @@ def forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ) -> Union[tuple, PerceptionLMModelOutputWithPast]: + """ + Forward pass of the PerceptionLM model. + + Args: + input_ids (`torch.LongTensor`, *optional*): + Indices of input sequence tokens in the vocabulary. + pixel_values (`torch.FloatTensor`, *optional*): + Input image tensor of shape `(batch_size, num_tiles, channels, height, width)`. + pixel_values_videos (`torch.FloatTensor`, *optional*): + Input video tensor of shape `(batch_size, num_frames, channels, height, width)`. + attention_mask (`torch.Tensor`, *optional*): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor`, *optional*): + Indices of positions of each input sequence token in the position embeddings. + past_key_values (`list[torch.FloatTensor]`, *optional*): + Precomputed key and value hidden states for fast autoregressive generation. + inputs_embeds (`torch.FloatTensor`, *optional*): + Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. + use_cache (`bool`, *optional*): + Whether or not to use past key values to speed up decoding. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor`, *optional*): + Position indices for caching. + logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0): + Number of logits to keep. + **lm_kwargs: + Additional keyword arguments for the language model. + + Returns: + [`PerceptionLMModelOutputWithPast`] or `tuple`: + Model outputs as a `PerceptionLMModelOutputWithPast` if `return_dict=True`, otherwise a tuple. + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -319,6 +356,45 @@ def forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: + """ + Forward pass for the PerceptionLMForConditionalGeneration model. 
+ + Args: + input_ids (`torch.LongTensor`, *optional*): + Indices of input sequence tokens in the vocabulary. + pixel_values (`torch.FloatTensor`, *optional*): + Input image tensor of shape `(batch_size, num_tiles, channels, height, width)`. + pixel_values_videos (`torch.FloatTensor`, *optional*): + Input video tensor of shape `(batch_size, num_frames, channels, height, width)`. + attention_mask (`torch.Tensor`, *optional*): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor`, *optional*): + Indices of positions of each input sequence token in the position embeddings. + past_key_values (`list[torch.FloatTensor]`, *optional*): + Precomputed key and value hidden states for fast autoregressive generation. + inputs_embeds (`torch.FloatTensor`, *optional*): + Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. + labels (`torch.LongTensor`, *optional*): + Labels for computing the language modeling loss. + use_cache (`bool`, *optional*): + Whether or not to use past key values to speed up decoding. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor`, *optional*): + Position indices for caching. + logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0): + Number of logits to keep. + **lm_kwargs: + Additional keyword arguments for the language model. + + Returns: + [`PerceptionLMCausalLMOutputWithPast`] or `tuple`: + Model outputs as a `PerceptionLMCausalLMOutputWithPast` if `return_dict=True`, otherwise a tuple. 
+ """ outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index 1ca67a3073b4..7ffc05451e9b 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -328,6 +328,37 @@ def test_generate_from_inputs_embeds_1_beam_search(self): def test_generate_from_inputs_embeds_with_static_cache(self): pass + ## Skip flash attention releated tests below + ## correct configuration: + ## from_pretrained(model_id, attn_implementation={"text_config": "flash_attention_2", "vision_config": "eager"} + @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.") + def test_eager_matches_fa2_generate(self): + pass + + @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.") + def test_flash_attn_2_fp32_ln(self): + pass + + @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.") + def test_flash_attn_2_from_config(self): + pass + + @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.") + def test_eager_matches_sdpa_generate_with_dynamic_cache(self): + pass + + @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.") + def test_flash_attn_2_inference_equivalence_right_padding(self): + pass + + @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.") + def test_eager_matches_sdpa_generate(self): + pass + + @unittest.skip("Flash attn test is not configured correctly as we need to configure vision/timm model to 'eager'.") + def test_flash_attn_2_inference_equivalence(self): + pass + TEST_MODEL_PATH = "shumingh/plm_1b_hf" From 99a1bf26db89048ed2ff2dadf69bdb7f8d2587aa Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Wed, 2 Jul 2025 16:39:54 +0000 Subject: [PATCH 63/65] update base_model_prefix --- .../perception_lm/modeling_perception_lm.py | 78 ++++++++++++++++++- .../perception_lm/modular_perception_lm.py | 2 +- 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index b3cd3d425bdb..942e8f0cdd56 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -90,7 +90,7 @@ def forward(self, features): @auto_docstring class PerceptionLMPreTrainedModel(PreTrainedModel): config_class = PerceptionLMConfig - base_model_prefix = "" + base_model_prefix = "model" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" _supports_cache_class = True @@ -234,6 +234,43 @@ def forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ) -> Union[tuple, PerceptionLMModelOutputWithPast]: + """ + Forward pass of the PerceptionLM model. + + Args: + input_ids (`torch.LongTensor`, *optional*): + Indices of input sequence tokens in the vocabulary. + pixel_values (`torch.FloatTensor`, *optional*): + Input image tensor of shape `(batch_size, num_tiles, channels, height, width)`. 
+ pixel_values_videos (`torch.FloatTensor`, *optional*): + Input video tensor of shape `(batch_size, num_frames, channels, height, width)`. + attention_mask (`torch.Tensor`, *optional*): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor`, *optional*): + Indices of positions of each input sequence token in the position embeddings. + past_key_values (`list[torch.FloatTensor]`, *optional*): + Precomputed key and value hidden states for fast autoregressive generation. + inputs_embeds (`torch.FloatTensor`, *optional*): + Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. + use_cache (`bool`, *optional*): + Whether or not to use past key values to speed up decoding. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor`, *optional*): + Position indices for caching. + logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0): + Number of logits to keep. + **lm_kwargs: + Additional keyword arguments for the language model. + + Returns: + [`PerceptionLMModelOutputWithPast`] or `tuple`: + Model outputs as a `PerceptionLMModelOutputWithPast` if `return_dict=True`, otherwise a tuple. + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -394,6 +431,45 @@ def forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: + """ + Forward pass for the PerceptionLMForConditionalGeneration model. + + Args: + input_ids (`torch.LongTensor`, *optional*): + Indices of input sequence tokens in the vocabulary. + pixel_values (`torch.FloatTensor`, *optional*): + Input image tensor of shape `(batch_size, num_tiles, channels, height, width)`. + pixel_values_videos (`torch.FloatTensor`, *optional*): + Input video tensor of shape `(batch_size, num_frames, channels, height, width)`. + attention_mask (`torch.Tensor`, *optional*): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor`, *optional*): + Indices of positions of each input sequence token in the position embeddings. + past_key_values (`list[torch.FloatTensor]`, *optional*): + Precomputed key and value hidden states for fast autoregressive generation. + inputs_embeds (`torch.FloatTensor`, *optional*): + Optionally, instead of passing `input_ids`, you can choose to directly pass an embedded representation. + labels (`torch.LongTensor`, *optional*): + Labels for computing the language modeling loss. + use_cache (`bool`, *optional*): + Whether or not to use past key values to speed up decoding. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor`, *optional*): + Position indices for caching. 
+ logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0): + Number of logits to keep. + **lm_kwargs: + Additional keyword arguments for the language model. + + Returns: + [`PerceptionLMCausalLMOutputWithPast`] or `tuple`: + Model outputs as a `PerceptionLMCausalLMOutputWithPast` if `return_dict=True`, otherwise a tuple. + """ outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 4e84eb0bece3..2f704ad68d36 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -99,7 +99,7 @@ def forward(self, features): class PerceptionLMPreTrainedModel(LlavaPreTrainedModel): - base_model_prefix = "" + base_model_prefix = "model" class PerceptionLMModelOutputWithPast(LlavaModelOutputWithPast): From 73c919af45ff7b4d116adaa4f87a9d5317cf4382 Mon Sep 17 00:00:00 2001 From: Shuming Hu Date: Sat, 5 Jul 2025 05:15:33 +0000 Subject: [PATCH 64/65] address reviewer feedback --- .../convert_perception_lm_weights_to_hf.py | 8 +-- .../perception_lm/modeling_perception_lm.py | 52 +++++++---------- .../perception_lm/modular_perception_lm.py | 58 +++++++------------ .../test_modeling_perception_lm.py | 4 +- 4 files changed, 49 insertions(+), 73 deletions(-) diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 6d87b758c31d..ee96c86876dd 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -298,10 +298,10 @@ def permute(w, n_heads, dim1=dim, dim2=dim): state_dict = { "model.language_model.embed_tokens.weight": loaded["tok_embeddings.weight"], "model.language_model.norm.weight": loaded["norm.weight"], - "model.multi_modal_projector.projector.0.weight": loaded["vision_projector.projector.0.weight"], - "model.multi_modal_projector.projector.2.weight": loaded["vision_projector.projector.2.weight"], - "model.multi_modal_projector.projector.0.bias": loaded["vision_projector.projector.0.bias"], - "model.multi_modal_projector.projector.2.bias": loaded["vision_projector.projector.2.bias"], + "model.multi_modal_projector.linear_1.weight": loaded["vision_projector.projector.0.weight"], + "model.multi_modal_projector.linear_2.weight": loaded["vision_projector.projector.2.weight"], + "model.multi_modal_projector.linear_1.bias": loaded["vision_projector.projector.0.bias"], + "model.multi_modal_projector.linear_2.bias": loaded["vision_projector.projector.2.bias"], } if not tie_word_embeddings: state_dict["lm_head.weight"] = loaded["output.weight"] diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 942e8f0cdd56..a2b6cf08d423 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -26,8 +26,7 @@ import torch.nn.functional as F from torch import nn -from transformers.generation.utils import GenerationMixin - +from ...generation.utils import GenerationMixin from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, can_return_tuple @@ -35,9 +34,9 @@ from 
.configuration_perception_lm import PerceptionLMConfig -class AdaptiveAvgPooling(nn.Module): +class PerceptionLMAdaptiveAvgPooling(nn.Module): def __init__(self, pooling_ratio=2): - super(AdaptiveAvgPooling, self).__init__() + super().__init__() self.pooling_ratio = pooling_ratio def forward(self, hidden_states): @@ -59,29 +58,28 @@ def __init__(self, config: PerceptionLMConfig): super().__init__() input_size = config.vision_config.model_args["embed_dim"] output_size = config.text_config.hidden_size - self.projector = nn.ModuleList( - [ - nn.Linear( - in_features=input_size, - out_features=output_size, - bias=True, - ), - nn.GELU(), - nn.Linear( - in_features=output_size, - out_features=output_size, - bias=True, - ), - ] + self.linear_1 = nn.Linear( + in_features=input_size, + out_features=output_size, + bias=True, + ) + self.gelu = nn.GELU() + self.linear_2 = nn.Linear( + in_features=output_size, + out_features=output_size, + bias=True, ) self.pooling = ( - AdaptiveAvgPooling(config.projector_pooling_ratio) if config.projector_pooling_ratio > 1 else nn.Identity() + PerceptionLMAdaptiveAvgPooling(config.projector_pooling_ratio) + if config.projector_pooling_ratio > 1 + else nn.Identity() ) def forward(self, features): features = features.permute(1, 0, 2) # NLD -> LND - for layer in self.projector: - features = layer(features) + features = self.linear_1(features) + features = self.gelu(features) + features = self.linear_2(features) features = features.permute(1, 0, 2) # LND -> NLD features = self.pooling(features) return features @@ -173,13 +171,13 @@ class PerceptionLMCausalLMOutputWithPast(ModelOutput): @auto_docstring class PerceptionLMModel(PerceptionLMPreTrainedModel): - _checkpoint_conversion_mapping = {"language_model.model": "language_model"} + _checkpoint_conversion_mapping = {} def __init__(self, config: PerceptionLMConfig): super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) self.multi_modal_projector = PerceptionLMMultiModalProjector(config) self.language_model = AutoModel.from_config(config.text_config) - self.vision_tower = AutoModel.from_config(config.vision_config) self.post_init() def get_input_embeddings(self): @@ -229,7 +227,6 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, @@ -258,8 +255,6 @@ def forward( Whether or not to return the attentions tensors of all attention layers. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. cache_position (`torch.LongTensor`, *optional*): Position indices for caching. 
logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0): @@ -275,7 +270,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: @@ -426,7 +420,6 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, @@ -457,8 +450,6 @@ def forward( Whether or not to return the attentions tensors of all attention layers. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. cache_position (`torch.LongTensor`, *optional*): Position indices for caching. logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0): @@ -481,7 +472,6 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, cache_position=cache_position, logits_to_keep=logits_to_keep, **lm_kwargs, diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py index 2f704ad68d36..1b001fea6752 100644 --- a/src/transformers/models/perception_lm/modular_perception_lm.py +++ b/src/transformers/models/perception_lm/modular_perception_lm.py @@ -21,8 +21,7 @@ import torch.utils.checkpoint from torch import nn -from transformers.generation.utils import GenerationMixin - +from ...generation.utils import GenerationMixin from ...utils import ( auto_docstring, can_return_tuple, @@ -40,15 +39,10 @@ logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = "PerceptionLMConfig" - -# Base docstring -_CHECKPOINT_FOR_DOC = "facebook/Perception-LM-1B" - -class AdaptiveAvgPooling(nn.Module): +class PerceptionLMAdaptiveAvgPooling(nn.Module): def __init__(self, pooling_ratio=2): - super(AdaptiveAvgPooling, self).__init__() + super().__init__() self.pooling_ratio = pooling_ratio def forward(self, hidden_states): @@ -70,29 +64,28 @@ def __init__(self, config: PerceptionLMConfig): super().__init__() input_size = config.vision_config.model_args["embed_dim"] output_size = config.text_config.hidden_size - self.projector = nn.ModuleList( - [ - nn.Linear( - in_features=input_size, - out_features=output_size, - bias=True, - ), - nn.GELU(), - nn.Linear( - in_features=output_size, - out_features=output_size, - bias=True, - ), - ] + self.linear_1 = nn.Linear( + in_features=input_size, + out_features=output_size, + bias=True, + ) + self.gelu = nn.GELU() + self.linear_2 = nn.Linear( + in_features=output_size, + out_features=output_size, + bias=True, ) self.pooling = ( - AdaptiveAvgPooling(config.projector_pooling_ratio) if config.projector_pooling_ratio > 1 else nn.Identity() + PerceptionLMAdaptiveAvgPooling(config.projector_pooling_ratio) + if config.projector_pooling_ratio > 1 + else nn.Identity() ) def forward(self, features): features = features.permute(1, 0, 2) # NLD -> LND - for layer in self.projector: - features = 
layer(features)
+        features = self.linear_1(features)
+        features = self.gelu(features)
+        features = self.linear_2(features)
         features = features.permute(1, 0, 2)  # LND -> NLD
         features = self.pooling(features)
         return features
@@ -112,9 +105,10 @@ class PerceptionLMCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
 
 @auto_docstring
 class PerceptionLMModel(LlavaModel):
+    _checkpoint_conversion_mapping = {}
+
     def __init__(self, config: PerceptionLMConfig):
         super().__init__(config)
-        del self.vision_tower
         self.vision_tower = AutoModel.from_config(config.vision_config)
         self.multi_modal_projector = PerceptionLMMultiModalProjector(config)
         self.language_model = AutoModel.from_config(config.text_config)
@@ -162,7 +156,6 @@ def forward(
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **lm_kwargs,
@@ -191,8 +184,6 @@
                 Whether or not to return the attentions tensors of all attention layers.
             output_hidden_states (`bool`, *optional*):
                 Whether or not to return the hidden states of all layers.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
             cache_position (`torch.LongTensor`, *optional*):
                 Position indices for caching.
             logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0):
@@ -208,7 +199,6 @@
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
         if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None:
@@ -274,7 +264,7 @@ def __init__(self, config: PerceptionLMConfig, **super_kwargs):
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
         self.post_init()
 
-    # Make modules available throught conditional class for BC
+    # Make modules available through conditional class for BC with test_sdpa_can_dispatch_composite_models
     @property
     def language_model(self):
         return self.model.language_model
@@ -351,7 +341,6 @@ def forward(
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **lm_kwargs,
@@ -382,8 +371,6 @@
                 Whether or not to return the attentions tensors of all attention layers.
             output_hidden_states (`bool`, *optional*):
                 Whether or not to return the hidden states of all layers.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
             cache_position (`torch.LongTensor`, *optional*):
                 Position indices for caching.
+            logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0):
@@ -406,7 +393,6 @@ def forward(
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
             **lm_kwargs,
diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py
index 7ffc05451e9b..16f521d70f60 100644
--- a/tests/models/perception_lm/test_modeling_perception_lm.py
+++ b/tests/models/perception_lm/test_modeling_perception_lm.py
@@ -308,11 +308,11 @@ def test_flash_attn_2_can_dispatch_composite_models(self):
     def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
         pass
 
-    @unittest.skip("ViT PE cannot be tested with meta device")
+    @unittest.skip("ViT PE / TimmWrapperModel cannot be tested with meta device")
     def test_can_be_initialized_on_meta(self):
         pass
 
-    @unittest.skip("ViT PE cannot be tested with meta device")
+    @unittest.skip("ViT PE / TimmWrapperModel cannot be tested with meta device")
     def test_can_load_with_meta_device_context_manager(self):
         pass

From 423af521edf642d71eb1e81d47cd9e9a897156ba Mon Sep 17 00:00:00 2001
From: Shuming Hu
Date: Sat, 5 Jul 2025 05:22:30 +0000
Subject: [PATCH 65/65] fix comment in modeling file

---
 src/transformers/models/perception_lm/modeling_perception_lm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py
index a2b6cf08d423..b7507b1343fa 100644
--- a/src/transformers/models/perception_lm/modeling_perception_lm.py
+++ b/src/transformers/models/perception_lm/modeling_perception_lm.py
@@ -343,7 +343,7 @@ def __init__(self, config: PerceptionLMConfig, **super_kwargs):
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
         self.post_init()
 
-    # Make modules available throught conditional class for BC
+    # Make modules available through conditional class for BC with test_sdpa_can_dispatch_composite_models
     @property
     def language_model(self):
         return self.model.language_model
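
The skipped flash-attention tests above record the intended configuration for mixed attention backends: only the text backbone should request "flash_attention_2", while the timm-wrapped vision tower has to stay on "eager". A minimal usage sketch of that configuration, assuming the TEST_MODEL_PATH checkpoint referenced by the tests and the standard from_pretrained / AutoProcessor entry points (illustrative only, not part of the patch series itself):

import torch
from transformers import AutoProcessor, PerceptionLMForConditionalGeneration

# Checkpoint id taken from TEST_MODEL_PATH in the tests above; swap in the
# officially released weights once they are available.
model_id = "shumingh/plm_1b_hf"

# Per the test comments, request flash attention only for the language model;
# the timm vision tower must remain on eager attention.
model = PerceptionLMForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation={"text_config": "flash_attention_2", "vision_config": "eager"},
)
processor = AutoProcessor.from_pretrained(model_id)

Dropping the attn_implementation argument falls back to the default attention selection, which is enough when flash attention is not installed.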