
Commit 5f497ce

[Main Update] Doc to messages feature support and Split simple and chat mode (#692)
* Update deps
* Restructured
* Delete models
* Remove deprecated models
* Set up auto doc to messages and chat models
* Lint
* Allow force simple mode
* Add auto doc to messages for audio and video
1 parent 477b802 commit 5f497ce


90 files changed: +286 / −3670 lines

lmms_eval/__main__.py

Lines changed: 2 additions & 17 deletions
@@ -265,6 +265,7 @@ def parse_eval_args() -> argparse.Namespace:
         help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
     )
     parser.add_argument("--process_with_media", action="store_true", help="Whether you will process you dataset with audio, image. By default set to False" "In case some benchmarks need to be processed with media, set this flag to True.")
+    parser.add_argument("--force_simple", action="store_true", help="Force the evaluation to use the simple mode of the models")
     args = parser.parse_args()
     return args

@@ -421,23 +422,6 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     elif args.tasks == "list_subtasks":
         eval_logger.info(task_manager.list_all_tasks(list_groups=False, list_tags=False))
         sys.exit()
-    elif args.tasks == "list_with_num":
-        log_message = (
-            "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70
-        )
-        eval_logger.info(log_message)
-        for task_name in sorted(task_manager.list_all_tasks()):
-            try:
-                task_dict = get_task_dict([task_name], model_name="llava")
-                task_obj = task_dict[task_name]
-                if type(task_obj) == tuple:
-                    group, task_obj = task_obj
-                if task_obj is None:
-                    continue
-                eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
-            except Exception as e:
-                eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}")
-        sys.exit()
     else:
         if os.path.isdir(args.tasks):
             import glob

@@ -496,6 +480,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
             fewshot_random_seed=args.seed[3],
             cli_args=args,
             datetime_str=datetime_str,
+            force_simple=args.force_simple,
             **request_caching_args,
         )
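
Read together, these hunks add the flag to the CLI parser and thread it through to the evaluator call. A minimal standalone sketch of that flow (only the argparse behavior is shown; the simple_evaluate call is elided to a comment):

# Sketch: how --force_simple travels from the command line into the evaluator.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--force_simple", action="store_true", help="Force the evaluation to use the simple mode of the models")
args = parser.parse_args(["--force_simple"])  # simulate passing the new flag
assert args.force_simple is True

# cli_evaluate_single() then forwards it unchanged:
# results = evaluator.simple_evaluate(..., force_simple=args.force_simple)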

lmms_eval/api/model.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,8 @@


 class lmms(abc.ABC):
+    is_simple: bool = True
+
     def __init__(self) -> None:
         """Defines the interface that should be implemented by all lmms subclasses.
         lmmss are assumed to take image-text as input and yield strings as output
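
With the class attribute defaulting to True, every existing model stays in simple mode unless it opts out. A minimal sketch of how a chat-mode implementation would override it (DummyChatModel is hypothetical; assumes lmms_eval is installed):

# Hypothetical subclass illustrating the is_simple switch; not part of this commit.
from lmms_eval.api.model import lmms

class DummyChatModel(lmms):
    is_simple = False  # route this model through chat-template (messages-based) tasks

    # the abstract lmms interface (e.g. generate_until) would be implemented here

# Downstream code can then branch on the attribute without isinstance checks:
# task_type = "simple" if lm.is_simple else "chat"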

lmms_eval/api/task.py

Lines changed: 45 additions & 1 deletion
@@ -32,6 +32,7 @@
 from datasets import Audio, DownloadConfig, Image, Sequence
 from huggingface_hub import snapshot_download
 from loguru import logger as eval_logger
+from PIL import Image as PIL_Image
 from PIL import ImageFile
 from tenacity import retry, stop_after_attempt, stop_after_delay, wait_fixed
 from tqdm import tqdm

@@ -91,6 +92,7 @@ class TaskConfig(dict):
     doc_to_text: Union[Callable, str] = None
     doc_to_target: Union[Callable, str] = None
     doc_to_choice: Union[Callable, str, dict, list] = None
+    doc_to_messages: Callable = None
     process_results: Union[Callable, str] = None
     use_prompt: str = None
     description: str = ""

@@ -952,7 +954,7 @@ def _download_from_youtube(path):
         )
         zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
         tar_files = glob(os.path.join(cache_path, "**/*.tar*"), recursive=True)
-
+
         def unzip_video_data(zip_file):
             import os
             import zipfile

@@ -1634,3 +1636,45 @@ def task_name(self) -> Any:

     def __repr__(self):
         return f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," f"output_type={self.OUTPUT_TYPE}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," f"num_samples={len(self.eval_docs)})"
+
+
+class ConfigurableMessagesTask(ConfigurableTask):
+    def doc_to_messages(self, doc: dict) -> Union[int, str, list]:
+        if callable(self.config.doc_to_messages):
+            return (
+                self.config.doc_to_messages(doc, self.lmms_eval_specific_kwargs)
+                if self.lmms_eval_specific_kwargs is not None and len(inspect.signature(self.config.doc_to_messages).parameters) == 2
+                else self.config.doc_to_messages(
+                    doc,
+                )
+            )
+        elif self.config.doc_to_messages is None and self.config.doc_to_visual is not None and self.config.doc_to_text is not None:
+            # An auto doc to messages function
+            def auto_doc_to_messages(doc):
+                visuals = self.doc_to_visual(doc)
+                text = self.doc_to_text(doc)
+                messages = [{"role": "user", "content": []}]
+                content = []
+                for visual in visuals:
+                    if isinstance(visual, PIL_Image.Image):
+                        content.append({"type": "image", "url": visual})
+                    elif isinstance(visual, dict):
+                        content.append({"type": "audio", "url": visual})
+                    elif isinstance(visual, str):
+                        content.append({"type": "video", "url": visual})
+                content.append({"type": "text", "text": text})
+                messages[0]["content"] = content
+                return messages
+
+            return auto_doc_to_messages(doc)
+        else:
+            # eval_logger.warning("Note that doc_to_visual was called but not set in config. Please check if this is a text-only task.")
+            return self.config.doc_to_messages
+
+    def construct_requests(self, doc_id: int, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
+        split = kwargs.get("metadata").get("split")
+        # kwargs.pop("split")
+        assert self.OUTPUT_TYPE == "generate_until", "Currently messages is used for generation only"
+
+        arguments = (self.doc_to_messages, copy.deepcopy(self.config.generation_kwargs), doc_id, self.config.task, split)
+        return Instance(request_type=self.OUTPUT_TYPE, arguments=arguments, idx=0, **kwargs)
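
When a task defines doc_to_visual and doc_to_text but no doc_to_messages, the fallback above synthesizes a single user turn: one content part per visual (PIL image → image, dict → audio, string path/URL → video) followed by the question text. A standalone sketch of the same mapping, runnable without lmms_eval:

# Mirrors auto_doc_to_messages above: classify each visual by Python type.
from PIL import Image as PIL_Image

def sketch_auto_doc_to_messages(visuals, text):
    content = []
    for visual in visuals:
        if isinstance(visual, PIL_Image.Image):
            content.append({"type": "image", "url": visual})
        elif isinstance(visual, dict):
            content.append({"type": "audio", "url": visual})  # e.g. a decoded audio dict
        elif isinstance(visual, str):
            content.append({"type": "video", "url": visual})  # a file path or URL
    content.append({"type": "text", "text": text})  # the question text always goes last
    return [{"role": "user", "content": content}]

demo = sketch_auto_doc_to_messages([PIL_Image.new("RGB", (64, 64))], "What is shown?")
assert demo[0]["content"][-1] == {"type": "text", "text": "What is shown?"}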

lmms_eval/evaluator.py

Lines changed: 5 additions & 5 deletions
@@ -79,6 +79,7 @@ def simple_evaluate(
     datetime_str: str = get_datetime_str(),
     distributed_executor_backend: str = "accelerate",
     cli_args=None,
+    force_simple: bool = False,
 ):
     """Instantiate and evaluate a model on a list of tasks.

@@ -172,12 +173,10 @@ def simple_evaluate(
     if task_manager is None:
         task_manager = TaskManager(verbosity, model_name=model)

-    task_dict = get_task_dict(tasks, task_manager)
-
     if isinstance(model, str):
         if model_args is None:
             model_args = ""
-        lm = lmms_eval.models.get_model(model).create_from_arg_string(
+        lm = lmms_eval.models.get_model(model, force_simple).create_from_arg_string(
             model_args,
             {
                 "batch_size": batch_size,

@@ -187,6 +186,8 @@ def simple_evaluate(
         )
     elif isinstance(model, lmms_eval.api.model.lmms):
         lm = model
+    task_type = "simple" if lm.is_simple else "chat"
+    task_dict = get_task_dict(tasks, task_manager, task_type)

     # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
     # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)

@@ -551,8 +552,7 @@ def evaluate(
                     ensure_ascii=False,
                 )
             ),
-            "prompt_hash": hash_string(requests[0].arguments[0]),
-            "target_hash": hash_string(str(target)),
+            # Removing prompt hash and target hash here
         }
         example.update(metrics)
         task_output.logged_samples.append(example)
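
Note that get_task_dict now runs after the model is instantiated, because the task flavor depends on the model: lm.is_simple selects between the classic task pipeline and the new messages-based one. A minimal sketch of that dispatch (the classes are stand-ins; get_task_dict's task_type argument is assumed to pick the task class):

# Stand-ins for lmms instances; only the is_simple attribute matters here.
class _SimpleModel:
    is_simple = True

class _ChatModel:
    is_simple = False

def pick_task_type(lm) -> str:
    # Same expression as in the diff above.
    return "simple" if lm.is_simple else "chat"

assert pick_task_type(_SimpleModel()) == "simple"
assert pick_task_type(_ChatModel()) == "chat"
# task_dict = get_task_dict(tasks, task_manager, pick_task_type(lm))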

lmms_eval/models/__init__.py

Lines changed: 22 additions & 5 deletions
@@ -1,6 +1,7 @@
 import importlib
 import os
 import sys
+from typing import Literal

 import hf_transfer
 from loguru import logger

@@ -12,7 +13,8 @@
 log_format = "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | " "<level>{level: <8}</level> | " "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - " "<level>{message}</level>"
 logger.add(sys.stdout, level="WARNING", format=log_format)

-AVAILABLE_MODELS = {
+
+AVAILABLE_SIMPLE_MODELS = {
     "aero": "Aero",
     "plm": "PerceptionLM",
     "aria": "Aria",

@@ -77,14 +79,28 @@
     "vora": "VoRA",
 }

+AVAILABLE_CHAT_TEMPLATE_MODELS = {"llava_hf": "LlavaHf"}
+

-def get_model(model_name):
-    if model_name not in AVAILABLE_MODELS:
+def get_model(model_name, force_simple: bool = False):
+    if model_name not in AVAILABLE_SIMPLE_MODELS and model_name not in AVAILABLE_CHAT_TEMPLATE_MODELS:
         raise ValueError(f"Model {model_name} not found in available models.")

+    if model_name in AVAILABLE_CHAT_TEMPLATE_MODELS:
+        model_type = "chat"
+        AVAILABLE_MODELS = AVAILABLE_CHAT_TEMPLATE_MODELS
+    else:
+        model_type = "simple"
+        AVAILABLE_MODELS = AVAILABLE_SIMPLE_MODELS
+
+    # Override with force_simple if needed
+    if force_simple:
+        model_type = "simple"
+        AVAILABLE_MODELS = AVAILABLE_SIMPLE_MODELS
+
     model_class = AVAILABLE_MODELS[model_name]
     if "." not in model_class:
-        model_class = f"lmms_eval.models.{model_name}.{model_class}"
+        model_class = f"lmms_eval.models.{model_type}.{model_name}.{model_class}"

     try:
         model_module, model_class = model_class.rsplit(".", 1)

@@ -99,5 +115,6 @@ def get_model(model_name):
 # Allow specifying other packages to import models from
 for plugin in os.environ["LMMS_EVAL_PLUGINS"].split(","):
     m = importlib.import_module(f"{plugin}.models")
+    # For plugin users, this will be replaced by chat template model later
     for model_name, model_class in getattr(m, "AVAILABLE_MODELS").items():
-        AVAILABLE_MODELS[model_name] = f"{plugin}.models.{model_name}.{model_class}"
+        AVAILABLE_SIMPLE_MODELS[model_name] = f"{plugin}.models.{model_name}.{model_class}"
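
The new lookup checks both registries, prefers the chat registry when the model has a chat implementation, and lets force_simple override that preference. A standalone sketch of the same control flow (resolve_model_path is a hypothetical helper with toy registry tables, so it runs without the real model modules):

AVAILABLE_SIMPLE_MODELS = {"llava_hf": "LlavaHf"}
AVAILABLE_CHAT_TEMPLATE_MODELS = {"llava_hf": "LlavaHf"}

def resolve_model_path(model_name: str, force_simple: bool = False) -> str:
    if model_name not in AVAILABLE_SIMPLE_MODELS and model_name not in AVAILABLE_CHAT_TEMPLATE_MODELS:
        raise ValueError(f"Model {model_name} not found in available models.")
    # Chat registry wins by default; force_simple flips it back to simple mode.
    if model_name in AVAILABLE_CHAT_TEMPLATE_MODELS and not force_simple:
        model_type, table = "chat", AVAILABLE_CHAT_TEMPLATE_MODELS
    else:
        model_type, table = "simple", AVAILABLE_SIMPLE_MODELS
    return f"lmms_eval.models.{model_type}.{model_name}.{table[model_name]}"

print(resolve_model_path("llava_hf"))                     # lmms_eval.models.chat.llava_hf.LlavaHf
print(resolve_model_path("llava_hf", force_simple=True))  # lmms_eval.models.simple.llava_hf.LlavaHf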

lmms_eval/models/chat/llava_hf.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+from accelerate import Accelerator, DistributedType
+from accelerate.state import AcceleratorState
+from decord import VideoReader, cpu
+from tqdm import tqdm
+from transformers import (
+    AutoConfig,
+    AutoProcessor,
+    LlavaForConditionalGeneration,
+    LlavaNextForConditionalGeneration,
+)
+
+from lmms_eval import utils
+from lmms_eval.api.instance import Instance
+from lmms_eval.api.model import lmms
+from lmms_eval.api.registry import register_model
+from lmms_eval.protocol import ChatMessages
+
+warnings.filterwarnings("ignore")
+
+from loguru import logger as eval_logger
+
+from lmms_eval.api.registry import register_model
+from lmms_eval.models.simple.llava_hf import LlavaHf as LlavaHfSimple
+
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_VIDEO_TOKEN = "<video>"
+
+# Default chat for llava-hf/llava-1.5 models: https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0
+VICUNA_CHAT_TEMPLATE = "{% for message in messages %}{% if loop.index0 == 0 %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {{ message['content'] }} {% elif message['role'] == 'user' %}USER: {{ message['content'] }} {% else %} ASSISTANT: {{ message['content'] }}{{ eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"
+
+
+@register_model("llava_hf_chat")
+class LlavaHf(LlavaHfSimple):
+    is_simple = False
+
+    def generate_until(self, requests: List[Instance]) -> List[str]:
+        res = []
+
+        # A dummy collate here to sort by doc id
+        def _collate(x):
+            return x[2], x[2]
+
+        # we group requests by their generation_kwargs,
+        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
+        # in the same batch.
+        re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
+        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
+        num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
+        pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
+        for chunk in chunks:
+            doc_to_messages, all_gen_kwargs, doc_id, task, split = zip(*chunk)
+            task = task[0]
+            split = split[0]
+            chat_messages = [doc_to_messages[0](self.task_dict[task][split][ids]) for ids in doc_id]
+            chat_messages: List[ChatMessages] = [ChatMessages(**{"messages": message}) for message in chat_messages]
+            visuals = []
+            videos = []
+            for messages in chat_messages:
+                visual, video, _ = messages.extract_media()
+                visuals.append(visual)
+                videos.append(video)
+            visuals = self.flatten(visuals)
+            videos = self.flatten(videos)
+            assert self.batch_size_per_gpu == 1, "Do not support batch_size_per_gpu > 1 for now"
+
+            # Apply chat template
+            messages = chat_messages[0].model_dump()["messages"]
+            text = self._image_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
+                eval_logger.debug(f"Prompt for doc ID {doc_id[0]}:\n\n{text}\n")
+
+            if len(videos) == 0:
+                videos = None
+            inputs = self._image_processor(images=visuals, videos=videos, text=text, return_tensors="pt").to(self._device, self.model.dtype)
+
+            # we assume all gen kwargs in the batch are the same
+            # this is safe to assume because the `grouper` object ensures it.
+            gen_kwargs = all_gen_kwargs[0]
+
+            gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
+            if "max_new_tokens" not in gen_kwargs:
+                gen_kwargs["max_new_tokens"] = 1024
+            if "temperature" not in gen_kwargs:
+                gen_kwargs["temperature"] = 0
+            if "top_p" not in gen_kwargs:
+                gen_kwargs["top_p"] = None
+            if "num_beams" not in gen_kwargs:
+                gen_kwargs["num_beams"] = 1
+            try:
+                cont = self.model.generate(
+                    **inputs,
+                    do_sample=True if gen_kwargs["temperature"] > 0 else False,
+                    temperature=gen_kwargs["temperature"],
+                    top_p=gen_kwargs["top_p"],
+                    num_beams=gen_kwargs["num_beams"],
+                    max_new_tokens=gen_kwargs["max_new_tokens"],
+                    use_cache=self.use_cache,
+                    pad_token_id=self.eot_token_id,
+                    eos_token_id=self.eot_token_id,
+                )
+                cont = cont[:, inputs["input_ids"].shape[-1] :]
+            except Exception as e:
+                eval_logger.error(f"Error {e} in generating")
+                cont = ""
+            text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
+            if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
+                eval_logger.debug(f"Generated text for doc ID {doc_id[0]}:\n\n{text_outputs}\n")
+
+            res.append(text_outputs)
+            self.cache_hook.add_partial("generate_until", (text, gen_kwargs), text_outputs)
+            pbar.update(1)
+        # reorder this group of results back to original unsorted form
+        res = re_ords.get_original(res)
+
+        pbar.close()
+        return res
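
In chat mode the request arguments carry a doc_to_messages callable rather than a pre-rendered prompt string; generate_until unpacks (doc_to_messages, gen_kwargs, doc_id, task, split), builds the message list per doc, and validates it through ChatMessages before applying the processor's chat template. A sketch of that payload shape (the doc and task names are made up; the ChatMessages validation step is elided since it needs the installed package):

from PIL import Image as PIL_Image

def doc_to_messages(doc):
    # Same structure the auto fallback in lmms_eval/api/task.py produces.
    return [{"role": "user", "content": [
        {"type": "image", "url": doc["image"]},
        {"type": "text", "text": doc["question"]},
    ]}]

doc = {"image": PIL_Image.new("RGB", (32, 32)), "question": "Describe the image."}
arguments = (doc_to_messages, {"max_new_tokens": 1024}, 0, "demo_task", "test")

# generate_until unpacks one chunk of such tuples:
fn, gen_kwargs, doc_id, task, split = arguments
messages = fn(doc)
# ...then wraps them as ChatMessages(messages=messages), extracts the media via
# extract_media(), and feeds the templated text plus pixels to the processor.
assert len(messages[0]["content"]) == 2  # one image part, one text part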
