-
Notifications
You must be signed in to change notification settings - Fork 208
Description
您的工作非常优秀!不过我有一个问题:有些基准(例如 MMLU、MMLU-Pro)是使用 few-shot 进行评测的,而我使用的是本地下载的数据集,那么我该如何确认评测时确实用上了 few-shot 呢?期待您的回复。
这是我输出的log文件。
2025-10-15 08:07:11,449 - evalscope - INFO - Dump task config to ./outputs/20251015_080701/configs/task_config_283902.yaml
2025-10-15 08:07:11,512 - evalscope - INFO - {
"model": "/20251001-SFT-stage-1-3B/checkpoint-120000",
"model_id": "checkpoint-120000",
"model_args": {},
"model_task": "text_generation",
"template_type": null,
"chat_template": null,
"datasets": [
"mmlu_pro"
],
"dataset_args": {
"mmlu_pro": {
"name": "mmlu_pro",
"dataset_id": "/data0/eval/MMLU-Pro-main/MMLU-Pro-data/data_my",
"model_adapter": "generation",
"output_types": [
"multiple_choice_logits",
"generation"
],
"subset_list": [
"computer science",
"math",
"chemistry",
"engineering",
"law",
"biology",
"health",
"physics",
"business",
"philosophy",
"economics",
"other",
"psychology",
"history"
],
"metric_list": [
"AverageAccuracy"
],
"few_shot_num": 5,
"few_shot_random": false,
"train_split": "validation",
"eval_split": "test",
"prompt_template": "The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}",
"system_prompt": null,
"query_template": null,
"pretty_name": "MMLU-Pro",
"description": "MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.",
"tags": [
"MCQ",
"Knowledge"
],
"filters": {
"remove_until": ""
},
"extra_params": {}
}
},
"dataset_dir": "/root/.cache/modelscope/hub/datasets",
"dataset_hub": "modelscope",
"generation_config": {
"max_tokens": 4096,
"temperature": 0,
"n": 1
},
"eval_type": "service",
"eval_backend": "Native",
"eval_config": null,
"stage": "all",
"limit": null,
"eval_batch_size": 64,
"mem_cache": false,
"use_cache": null,
"work_dir": "./outputs/20251015_080701",
"outputs": null,
"ignore_errors": false,
"debug": false,
"dry_run": false,
"seed": 42,
"api_url": "http://localhost:8801/v1/chat/completions",
"api_key": "EMPTY",
"timeout": 60000,
"stream": true,
"judge_strategy": "auto",
"judge_worker_num": 1,
"judge_model_args": {},
"analysis_report": false
}
2025-10-15 08:07:11,512 - evalscope - INFO - Start evaluating on dataset /data0/train-aitech/projects/yingwei5/eval/MMLU-Pro-main/MMLU-Pro-data/data_my
2025-10-15 08:07:11,540 - evalscope - INFO - Loading dataset from local disk: /data0/train-aitech/projects/yingwei5/eval/MMLU-Pro-main/MMLU-Pro-data/data_my
2025-10-15 08:07:11,983 - evalscope - INFO - Loading dataset: dataset_name: /data0/train-aitech/projects/yingwei5/eval/MMLU-Pro-main/MMLU-Pro-data/data_my > subsets: ['default']
2025-10-15 08:07:15,212 - evalscope - INFO - Use settings: > few_shot_num: 5, > few_shot_split: validation, > target_eval_split: test
从上面的日志来看,我该如何确认评测时确实使用了 few-shot 示例呢?