few shot 评测基准

您的工作非常优秀！不过我有一个问题：有一些基准（例如 MMLU、MMLU-Pro）是使用 few-shot 进行评测的，而我使用的是本地下载的数据集，那么我该如何确认评测时确实用上了 few-shot 呢？期待您的回复。
以下是我运行时输出的 log 文件内容：
2025-10-15 08:07:11,449 - evalscope - INFO - Dump task config to ./outputs/20251015_080701/configs/task_config_283902.yaml
2025-10-15 08:07:11,512 - evalscope - INFO - {
    "model": "/20251001-SFT-stage-1-3B/checkpoint-120000",
    "model_id": "checkpoint-120000",
    "model_args": {},
    "model_task": "text_generation",
    "template_type": null,
    "chat_template": null,
    "datasets": [
        "mmlu_pro"
    ],
    "dataset_args": {
        "mmlu_pro": {
            "name": "mmlu_pro",
            "dataset_id": "/data0/eval/MMLU-Pro-main/MMLU-Pro-data/data_my",
            "model_adapter": "generation",
            "output_types": [
                "multiple_choice_logits",
                "generation"
            ],
            "subset_list": [
                "computer science",
                "math",
                "chemistry",
                "engineering",
                "law",
                "biology",
                "health",
                "physics",
                "business",
                "philosophy",
                "economics",
                "other",
                "psychology",
                "history"
            ],
            "metric_list": [
                "AverageAccuracy"
            ],
            "few_shot_num": 5,
            "few_shot_random": false,
            "train_split": "validation",
            "eval_split": "test",
            "prompt_template": "The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}",
            "system_prompt": null,
            "query_template": null,
            "pretty_name": "MMLU-Pro",
            "description": "MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.",
            "tags": [
                "MCQ",
                "Knowledge"
            ],
            "filters": {
                "remove_until": "</think>"
            },
            "extra_params": {}
        }
    },
    "dataset_dir": "/root/.cache/modelscope/hub/datasets",
    "dataset_hub": "modelscope",
    "generation_config": {
        "max_tokens": 4096,
        "temperature": 0,
        "n": 1
    },
    "eval_type": "service",
    "eval_backend": "Native",
    "eval_config": null,
    "stage": "all",
    "limit": null,
    "eval_batch_size": 64,
    "mem_cache": false,
    "use_cache": null,
    "work_dir": "./outputs/20251015_080701",
    "outputs": null,
    "ignore_errors": false,
    "debug": false,
    "dry_run": false,
    "seed": 42,
    "api_url": "http://localhost:8801/v1/chat/completions",
    "api_key": "EMPTY",
    "timeout": 60000,
    "stream": true,
    "judge_strategy": "auto",
    "judge_worker_num": 1,
    "judge_model_args": {},
    "analysis_report": false
}
2025-10-15 08:07:11,512 - evalscope - INFO - Start evaluating on dataset /data0/train-aitech/projects/yingwei5/eval/MMLU-Pro-main/MMLU-Pro-data/data_my
2025-10-15 08:07:11,540 - evalscope - INFO - Loading dataset from local disk: /data0/train-aitech/projects/yingwei5/eval/MMLU-Pro-main/MMLU-Pro-data/data_my
2025-10-15 08:07:11,983 - evalscope - INFO - Loading dataset: dataset_name: /data0/train-aitech/projects/yingwei5/eval/MMLU-Pro-main/MMLU-Pro-data/data_my > subsets: ['default']
2025-10-15 08:07:15,212 - evalscope - INFO - Use settings: > few_shot_num: 5, > few_shot_split: validation, > target_eval_split: test


那么，我该如何确认评测过程中实际使用了 few-shot 示例呢？

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

few shot 评测基准 #874

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

few shot 评测基准 #874

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions