Description
I ran the code from the lmms-eval repository (https://github.com/EvolvingLMMs-Lab/lmms-eval) after making the following corrections:
1. Imported IGNORE_INDEX from llava.constants in llava_qwen.py.
2. Pasted the generate_moviechat() function from this repository into llava_qwen.py.
3. Changed the import path LLAVA_NeXT.llava to llava in llava_onevision_moviechat.py.
4. Added a generate_until_multi_round() function in llava_onevision_moviechat.py to resolve the error that this abstract method was not implemented. I only declared it; it is never actually called (see the sketch after this list).
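For reference, the step-4 stub was along these lines (a minimal sketch of a do-nothing override; the exact signature required by the lmms-eval base model class may differ slightly, and the requests argument name is an assumption):

# Inside the model class in llava_onevision_moviechat.py:
def generate_until_multi_round(self, requests) -> list[str]:
    # Declared only to satisfy the abstract method required by the lmms-eval
    # base model class; it is never called during this single-round evaluation.
    raise NotImplementedError("Multi-round generation is not supported here.")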
After these adjustments, I tested the llava_onevision_moviechat model using the following script:
python3 -m accelerate.commands.launch \
    --num_processes=8 \
    -m lmms_eval \
    --model llava_onevision_moviechat \
    --tasks moviechat_global \
    --batch_size 1 \
    --log_samples \
    --log_samples_suffix llava_onevision_moviechat \
    --output_path ./logs/
However, the accuracy I achieved was only about 39%, as shown in the results below. Could you kindly help me identify what might have gone wrong?
{
  "results": {
    "moviechat_global": {
      "alias": "moviechat_global",
      "gpt_eval_score,none": 2.7583781547372777,
      "gpt_eval_score_stderr,none": "N/A",
      "gpt_eval_acc,none": 0.39098055440628876,
      "gpt_eval_acc_stderr,none": "N/A"
    }
  },
  "group_subtasks": {
    "moviechat_global": []
  },
  "configs": {
    "moviechat_global": {
      "task": "moviechat_global",
      "dataset_path": "Enxin/lmms_MovieChat_test",
      "dataset_kwargs": {
        "token": true
      },
      "test_split": "test",
      "full_docs": false,
      "process_results_use_image": false,
      "doc_to_visual": "<function moviechat_doc_to_visual at 0x7f8ab8418af0>",
      "doc_to_text": "<function moviechat_doc_to_text at 0x7f8ab8431670>",
      "doc_to_target": "<function moviechat_doc_to_answer at 0x7f8ab8431f70>",
      "process_results": "<function moviechat_process_results_generic at 0x7f8ab843baf0>",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "gpt_eval_score",
          "aggregation": "<function moviechat_aggregate_score at 0x7f8ab8442430>",
          "higher_is_better": true
        },
        {
          "metric": "gpt_eval_acc",
          "aggregation": "<function moviechat_aggregate_acc at 0x7f8ab8442d30>",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "until": [
          "\n\n"
        ],
        "do_sample": false
      },
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": {
        "version": 0.0,
        "gpt_eval_model_name": "gpt-3.5-turbo-0125"
      },
      "lmms_eval_specific_kwargs": {
        "default": {
          "pre_prompt": "You are able to understand the visual content that the user provides.Follow the instructions carefully and explain your answers in detail.",
          "post_prompt": ""
        },
        "pre_prompt": "You are able to understand the visual content that the user provides.Follow the instructions carefully and explain your answers in detail.",
        "post_prompt": ""
      }
    }
  },
  "versions": {
    "moviechat_global": 0.0
  },
  "n-shot": {
    "moviechat_global": 0
  },
  "higher_is_better": {
    "moviechat_global": {
      "gpt_eval_score": true,
      "gpt_eval_acc": true
    }
  },
  "n-samples": {
    "moviechat_global": {
      "original": 2417,
      "effective": 2417
    }
  },
  "config": {
    "model": "llava_onevision_moviechat",
    "model_args": "",
    "batch_size": "1",
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": "",
    "random_seed": 0,
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "fewshot_seed": 1234
  },
  "git_hash": "d2056e6",
  "date": "20241127_134436",
  "task_hashes": {
    "moviechat_global": "51d9d796ea5bc78838d989e9f7802a1cdf68efab207ef3ef66d59a9993836fef"
  },
  "model_source": "llava_onevision_moviechat",
  "model_name": "",
  "model_name_sanitized": "",
  "system_instruction": null,
  "system_instruction_sha": null,
  "fewshot_as_multiturn": false,
  "chat_template": null,
  "chat_template_sha": null,
  "start_time": 2483129.288830893,
  "end_time": 2503653.673547,
  "total_evaluation_time_seconds": "20524.384716107044"
}