
Commit 3883882

Set up auto doc to messages and chat models

1 parent 26bbaa1 · commit 3883882

File tree: 8 files changed, +244 −35 lines

lmms_eval/__main__.py

Lines changed: 0 additions & 17 deletions
@@ -420,23 +420,6 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     elif args.tasks == "list_subtasks":
         eval_logger.info(task_manager.list_all_tasks(list_groups=False, list_tags=False))
         sys.exit()
-    elif args.tasks == "list_with_num":
-        log_message = (
-            "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70
-        )
-        eval_logger.info(log_message)
-        for task_name in sorted(task_manager.list_all_tasks()):
-            try:
-                task_dict = get_task_dict([task_name], model_name="llava")
-                task_obj = task_dict[task_name]
-                if type(task_obj) == tuple:
-                    group, task_obj = task_obj
-                    if task_obj is None:
-                        continue
-                eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
-            except Exception as e:
-                eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}")
-        sys.exit()
     else:
         if os.path.isdir(args.tasks):
             import glob

lmms_eval/api/model.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,8 @@
 
 
 class lmms(abc.ABC):
+    is_simple: bool = True
+
     def __init__(self) -> None:
         """Defines the interface that should be implemented by all lmms subclasses.
         lmmss are assumed to take image-text as input and yield strings as output
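The flag defaults to True, so every existing model keeps the simple request path; a backend that wants the new chat/messages path just overrides it. A minimal sketch of opting in (MyChatModel is a hypothetical name, not part of this commit):

    from lmms_eval.api.model import lmms

    class MyChatModel(lmms):
        # Opting in to the chat/messages request path; the evaluator reads
        # this flag, as the simple_evaluate change below shows.
        is_simple = False

        def generate_until(self, requests):
            # Chat-path requests carry a doc_to_messages callable rather than
            # a pre-rendered prompt string; see chat/llava_hf.py below.
            ...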

lmms_eval/api/task.py

Lines changed: 40 additions & 0 deletions
@@ -32,6 +32,7 @@
 from datasets import Audio, DownloadConfig, Image, Sequence
 from huggingface_hub import snapshot_download
 from loguru import logger as eval_logger
+from PIL import Image as PIL_Image
 from PIL import ImageFile
 from tenacity import retry, stop_after_attempt, stop_after_delay, wait_fixed
 from tqdm import tqdm
@@ -91,6 +92,7 @@ class TaskConfig(dict):
     doc_to_text: Union[Callable, str] = None
     doc_to_target: Union[Callable, str] = None
     doc_to_choice: Union[Callable, str, dict, list] = None
+    doc_to_messages: Callable = None
     process_results: Union[Callable, str] = None
     use_prompt: str = None
     description: str = ""
@@ -1634,3 +1636,41 @@ def task_name(self) -> Any:
 
     def __repr__(self):
         return f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," f"output_type={self.OUTPUT_TYPE}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," f"num_samples={len(self.eval_docs)})"
+
+
+class ConfigurableMessagesTask(ConfigurableTask):
+    def doc_to_messages(self, doc: dict) -> Union[int, str, list]:
+        if callable(self.config.doc_to_messages):
+            return (
+                self.config.doc_to_messages(doc, self.lmms_eval_specific_kwargs)
+                if self.lmms_eval_specific_kwargs is not None and len(inspect.signature(self.config.doc_to_messages).parameters) == 2
+                else self.config.doc_to_messages(
+                    doc,
+                )
+            )
+        elif self.config.doc_to_messages is None and self.config.doc_to_visual is not None and self.config.doc_to_text is not None:
+            # An auto doc-to-messages function
+            def auto_doc_to_messages(doc):
+                visuals = self.doc_to_visual(doc)
+                text = self.doc_to_text(doc)
+                messages = [{"role": "user", "content": []}]
+                content = []
+                for visual in visuals:
+                    if isinstance(visual, PIL_Image.Image):
+                        content.append({"type": "image", "url": visual})
+                content.append({"type": "text", "text": text})
+                messages[0]["content"] = content
+                return messages
+
+            return auto_doc_to_messages(doc)
+        else:
+            # eval_logger.warning("Note that doc_to_visual was called but not set in config. Please check if this is a text-only task.")
+            return self.config.doc_to_messages
+
+    def construct_requests(self, doc_id: int, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
+        split = kwargs.get("metadata").get("split")
+        # kwargs.pop("split")
+        assert self.OUTPUT_TYPE == "generate_until", "Currently messages is used for generation only"
+
+        arguments = (self.doc_to_messages, copy.deepcopy(self.config.generation_kwargs), doc_id, self.config.task, split)
+        return Instance(request_type=self.OUTPUT_TYPE, arguments=arguments, idx=0, **kwargs)
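For a task that defines only doc_to_visual and doc_to_text, the auto converter wraps everything into a single OpenAI-style user turn. A sketch of the structure it returns for a one-image doc (the image and question are stand-ins):

    from PIL import Image

    # Suppose doc_to_visual(doc) -> [img] and doc_to_text(doc) -> a question.
    img = Image.new("RGB", (336, 336))

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": img},  # one entry per PIL image
                {"type": "text", "text": "What is in the image?"},  # prompt text appended last
            ],
        }
    ]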

lmms_eval/evaluator.py

Lines changed: 3 additions & 4 deletions
@@ -172,8 +172,6 @@ def simple_evaluate(
     if task_manager is None:
         task_manager = TaskManager(verbosity, model_name=model)
 
-    task_dict = get_task_dict(tasks, task_manager)
-
     if isinstance(model, str):
         if model_args is None:
             model_args = ""
@@ -187,6 +185,8 @@ def simple_evaluate(
         )
     elif isinstance(model, lmms_eval.api.model.lmms):
         lm = model
+    task_type = "simple" if lm.is_simple else "chat"
+    task_dict = get_task_dict(tasks, task_manager, task_type)
 
     # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
     # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
@@ -551,8 +551,7 @@ def evaluate(
                         ensure_ascii=False,
                     )
                 ),
-                "prompt_hash": hash_string(requests[0].arguments[0]),
-                "target_hash": hash_string(str(target)),
+                # Removing prompt hash and target hash here
             }
             example.update(metrics)
            task_output.logged_samples.append(example)
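Downstream, get_task_dict now receives the task_type and, presumably, instantiates ConfigurableMessagesTask rather than ConfigurableTask for the chat path. A hedged sketch of that selection (build_task is a hypothetical helper; the real logic lives inside get_task_dict, which this commit does not show):

    def build_task(task_config, task_type: str = "simple"):
        # Assumption: chat-capable models get the messages-based task class,
        # everything else keeps the plain ConfigurableTask.
        if task_type == "chat":
            return ConfigurableMessagesTask(config=task_config)
        return ConfigurableTask(config=task_config)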

lmms_eval/models/__init__.py

Lines changed: 16 additions & 4 deletions
@@ -1,6 +1,7 @@
 import importlib
 import os
 import sys
+from typing import Literal
 
 import hf_transfer
 from loguru import logger
@@ -10,7 +11,8 @@
 logger.remove()
 logger.add(sys.stdout, level="WARNING")
 
-AVAILABLE_MODELS = {
+
+AVAILABLE_SIMPLE_MODELS = {
     "aero": "Aero",
     "plm": "PerceptionLM",
     "aria": "Aria",
@@ -75,14 +77,23 @@
     "vora": "VoRA",
 }
 
+AVAILABLE_CHAT_TEMPLATE_MODELS = {"llava_hf": "LlavaHf"}
+
 
 def get_model(model_name):
-    if model_name not in AVAILABLE_MODELS:
+    if model_name not in AVAILABLE_SIMPLE_MODELS and model_name not in AVAILABLE_CHAT_TEMPLATE_MODELS:
         raise ValueError(f"Model {model_name} not found in available models.")
 
+    if model_name in AVAILABLE_CHAT_TEMPLATE_MODELS:
+        model_type = "chat"
+        AVAILABLE_MODELS = AVAILABLE_CHAT_TEMPLATE_MODELS
+    else:
+        model_type = "simple"
+        AVAILABLE_MODELS = AVAILABLE_SIMPLE_MODELS
+
     model_class = AVAILABLE_MODELS[model_name]
     if "." not in model_class:
-        model_class = f"lmms_eval.models.{model_name}.{model_class}"
+        model_class = f"lmms_eval.models.{model_type}.{model_name}.{model_class}"
 
     try:
         model_module, model_class = model_class.rsplit(".", 1)
@@ -97,5 +108,6 @@ def get_model(model_name):
 # Allow specifying other packages to import models from
 for plugin in os.environ["LMMS_EVAL_PLUGINS"].split(","):
     m = importlib.import_module(f"{plugin}.models")
+    # For plugin users, this will be replaced by the chat template model later
     for model_name, model_class in getattr(m, "AVAILABLE_MODELS").items():
-        AVAILABLE_MODELS[model_name] = f"{plugin}.models.{model_name}.{model_class}"
+        AVAILABLE_SIMPLE_MODELS[model_name] = f"{plugin}.models.{model_name}.{model_class}"
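The registry name alone now decides both the import path and the request style. A usage sketch consistent with the tables above:

    from lmms_eval.models import get_model

    # "llava_hf" is listed in AVAILABLE_CHAT_TEMPLATE_MODELS, so this resolves
    # to lmms_eval.models.chat.llava_hf.LlavaHf (the chat-path model below).
    ChatModel = get_model("llava_hf")

    # "aria" appears only in AVAILABLE_SIMPLE_MODELS, so it resolves to
    # lmms_eval.models.simple.aria.Aria.
    SimpleModel = get_model("aria")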

lmms_eval/models/chat/llava_hf.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+from accelerate import Accelerator, DistributedType
+from accelerate.state import AcceleratorState
+from decord import VideoReader, cpu
+from tqdm import tqdm
+from transformers import (
+    AutoConfig,
+    AutoProcessor,
+    LlavaForConditionalGeneration,
+    LlavaNextForConditionalGeneration,
+)
+
+from lmms_eval import utils
+from lmms_eval.api.instance import Instance
+from lmms_eval.api.model import lmms
+from lmms_eval.api.registry import register_model
+from lmms_eval.protocol import ChatMessages
+
+warnings.filterwarnings("ignore")
+
+from loguru import logger as eval_logger
+
+from lmms_eval.api.registry import register_model
+from lmms_eval.models.simple.llava_hf import LlavaHf as LlavaHfSimple
+
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_VIDEO_TOKEN = "<video>"
+
+# Default chat template for llava-hf/llava-1.5 models: https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0
+VICUNA_CHAT_TEMPLATE = "{% for message in messages %}{% if loop.index0 == 0 %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {{ message['content'] }} {% elif message['role'] == 'user' %}USER: {{ message['content'] }} {% else %} ASSISTANT: {{ message['content'] }}{{ eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"
+
+
+@register_model("llava_hf_chat")
+class LlavaHf(LlavaHfSimple):
+    is_simple = False
+
+    def generate_until(self, requests: List[Instance]) -> List[str]:
+        res = []
+
+        # A dummy collate here to sort by doc id
+        def _collate(x):
+            return x[2], x[2]
+
+        # We group requests by their generation_kwargs, so that we don't try
+        # to execute e.g. greedy sampling and temp=0.8 sampling
+        # in the same batch.
+        re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
+        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
+        num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
+        pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
+        for chunk in chunks:
+            doc_to_messages, all_gen_kwargs, doc_id, task, split = zip(*chunk)
+            task = task[0]
+            split = split[0]
+            chat_messages = [doc_to_messages[0](self.task_dict[task][split][ids]) for ids in doc_id]
+            chat_messages: List[ChatMessages] = [ChatMessages(**{"messages": message}) for message in chat_messages]
+            visuals = []
+            videos = []
+            for messages in chat_messages:
+                visual, video, _ = messages.extract_media()
+                visuals.append(visual)
+                videos.append(video)
+            visuals = self.flatten(visuals)
+            videos = self.flatten(videos)
+            assert self.batch_size_per_gpu == 1, "Do not support batch_size_per_gpu > 1 for now"
+
+            # Apply chat template
+            messages = chat_messages[0].model_dump()["messages"]
+            text = self._image_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
+                eval_logger.debug(f"Prompt for doc ID {doc_id[0]}:\n\n{text}\n")
+
+            if len(videos) == 0:
+                videos = None
+            inputs = self._image_processor(images=visuals, videos=videos, text=text, return_tensors="pt").to(self._device, self.model.dtype)
+
+            # We assume all gen kwargs in the batch are the same;
+            # this is safe to assume because the `grouper` object ensures it.
+            gen_kwargs = all_gen_kwargs[0]
+
+            gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
+            if "max_new_tokens" not in gen_kwargs:
+                gen_kwargs["max_new_tokens"] = 1024
+            if "temperature" not in gen_kwargs:
+                gen_kwargs["temperature"] = 0
+            if "top_p" not in gen_kwargs:
+                gen_kwargs["top_p"] = None
+            if "num_beams" not in gen_kwargs:
+                gen_kwargs["num_beams"] = 1
+            try:
+                cont = self.model.generate(
+                    **inputs,
+                    do_sample=True if gen_kwargs["temperature"] > 0 else False,
+                    temperature=gen_kwargs["temperature"],
+                    top_p=gen_kwargs["top_p"],
+                    num_beams=gen_kwargs["num_beams"],
+                    max_new_tokens=gen_kwargs["max_new_tokens"],
+                    use_cache=self.use_cache,
+                    pad_token_id=self.eot_token_id,
+                    eos_token_id=self.eot_token_id,
+                )
+                cont = cont[:, inputs["input_ids"].shape[-1] :]
+            except Exception as e:
+                eval_logger.error(f"Error {e} in generating")
+                cont = ""
+            text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
+            if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
+                eval_logger.debug(f"Generated text for doc ID {doc_id[0]}:\n\n{text_outputs}\n")
+
+            res.append(text_outputs)
+            self.cache_hook.add_partial("generate_until", (text, gen_kwargs), text_outputs)
+            pbar.update(1)
+        # Reorder this group of results back to original unsorted form
+        res = re_ords.get_original(res)
+
+        pbar.close()
+        return res
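Each chat-path request arrives carrying the five-tuple built by ConfigurableMessagesTask.construct_requests, so the model rebuilds the messages from the doc at generation time. The unpacking, in outline (instance is a stand-in name for one request):

    # args = (doc_to_messages, gen_kwargs, doc_id, task, split)
    doc_to_messages, gen_kwargs, doc_id, task, split = instance.args

    doc = self.task_dict[task][split][doc_id]      # look the doc back up
    raw = doc_to_messages(doc)                     # list of role/content dicts
    chat = ChatMessages(messages=raw)              # validate against the protocol
    images, videos, audios = chat.extract_media()  # media for the processor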

lmms_eval/protocol.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+from typing import Any, Dict, List, Literal, Union
+
+from PIL import Image
+from pydantic import BaseModel
+
+
+class ChatTextContent(BaseModel):
+    type: Literal["text"] = "text"
+    text: str
+
+
+class ChatImageContent(BaseModel):
+    type: Literal["image"] = "image"
+    url: Any
+
+    def model_dump(self, **kwargs):
+        content = super().model_dump(**kwargs)
+        # Some models may need this placeholder for hf_chat_template
+        content["image_url"] = "placeholder"
+        return content
+
+
+ChatContent = Union[ChatTextContent, ChatImageContent]
+
+
+class ChatMessage(BaseModel):
+    role: Literal["user", "system", "assistant"]
+    content: List[ChatContent]
+
+
+class ChatMessages(BaseModel):
+    messages: List[ChatMessage]
+
+    def extract_media(self):
+        images = []
+        videos = []
+        audios = []
+
+        for message in self.messages:
+            for content in message.content:
+                if content.type == "image":
+                    images.append(content.url)
+
+        return images, videos, audios
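A quick usage sketch of the protocol types (the image is a stand-in; note that extract_media currently collects images only, so videos and audios come back empty):

    from PIL import Image
    from lmms_eval.protocol import ChatMessages

    img = Image.new("RGB", (64, 64))

    chat = ChatMessages(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": img},
                    {"type": "text", "text": "Describe the image."},
                ],
            }
        ]
    )

    images, videos, audios = chat.extract_media()
    assert images == [img] and videos == [] and audios == []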
