-
Notifications
You must be signed in to change notification settings - Fork 208
Description
# Build the evaluation task from the parsed command-line arguments (`args`)
# and run it. `timeout` and `stream` live inside `generation_config` because
# evalscope reports the top-level `timeout=` / `stream=` parameters as
# deprecated (to be removed in v1.1.0).
task_cfg = TaskConfig(
    # model='Qwen3-32B',
    # api_url='http://127.0.0.1:8996/v1/chat/completions',
    model=args.model,
    # Chat-completions endpoint on the local port given on the command line.
    api_url=f'http://127.0.0.1:{args.port}/v1/chat/completions',
    # NOTE(review): evalscope warns that the 'server' eval type is deprecated
    # since 1.0.0 and suggests 'openai_api' instead -- consider passing that
    # value on the command line.
    eval_type=args.eval_type,
    datasets=[
        f'{args.dataset}'
        # 'gsm8k','iquiz', 'aime24','mmlu','super_gpqa', 'bbh'
    ],
    # Per-dataset overrides; only the entry matching the selected dataset
    # is relevant for a given run.
    # NOTE(review): every 'remove_until' value below is an empty string; the
    # accompanying comments suggest it is meant to hold the end-of-thinking
    # marker of the model output -- confirm the intended value.
    dataset_args={
        'aime24': {
            'dataset_id': 'HuggingFaceH4/aime_2024',
            'filters': {'remove_until': ''},  # Filter out the content of thinking
        },
        'iquiz': {
            # 'local_path': "/usr/local/lib/python3.12/site-packages/evalscope/custom_eval/",
            # 'subset_list':['IQ', 'EQ'],
            'dataset_id': 'AI-ModelScope/IQuiz',
            'filters': {'remove_until': ''},  # Filter out the content of thinking
        },
        'mmlu': {
            # 'dataset_id': 'modelscope/mmlu',
            # dataset_id is not required in version 1.0.2; the default is cais/mmlu
            'dataset_id': 'cais/mmlu',
            'filters': {'remove_until': ''},
            # 'subset_list':mmlu_subset_list
        },
        'super_gpqa': {
            'dataset_id': 'm-a-p/SuperGPQA',
            'filters': {'remove_until': ''},  # Filter out the content of thinking
        },
        'bbh': {
            'dataset_id': 'modelscope/bbh',
            'filters': {'remove_until': ''},  # Filter out the content of thinking
        },
        'gsm8k': {
            'dataset_id': 'modelscope/gsm8k',
            'filters': {'remove_until': ''},  # Filter out the content of thinking
        },
    },
    eval_batch_size=args.batch_size,
    generation_config={
        # Large cap on generated tokens to avoid truncated outputs.
        'max_tokens': 30000,
        # Sampling parameters come from the command line (the Qwen report
        # recommends specific values for these).
        'temperature': args.temperature,
        'top_p': args.top_p,
        'top_k': args.top_k,
        'n': 1,  # Number of replies generated per request
        'timeout': 70000,  # Request timeout
        'stream': False,  # Streaming output disabled
    },
    # Number of samples to evaluate (e.g. set to 100 for a quick test).
    limit=args.sample_size,
    ignore_errors=True,
)
run_task(task_cfg=task_cfg)
上面是评测任务的主要部分
执行命令是: python3 eval_public2.py --model Qwen3-30B-A3B --dataset iquiz --sample_size 120 --eval_type server --port 8000
抛出的异常是:
19:23:26,540 - numexpr.utils - INFO: Note: NumExpr detected 44 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
2025-10-16 19:23:26,540 - numexpr.utils - INFO: NumExpr defaulting to 16 threads.
In evalscope Current temperature: 0.6
In evalscope Current top_p: 0.95
In evalscope Current top_k: 20
2025-10-16 19:23:26 - evalscope - WARNING: Deprecated: The timeout
parameter is deprecated and will be removed in v1.1.0. Use generation_config.timeout
instead.
2025-10-16 19:23:26 - evalscope - WARNING: Deprecated: The stream
parameter is deprecated and will be removed in v1.1.0. Use generation_config.stream
instead.
2025-10-16 19:23:26 - evalscope - INFO: Args: Task config is provided with TaskConfig type.
2025-10-16 19:23:27 - evalscope - INFO: Creating model Qwen3-30B-A3B with eval_type=server base_url=http://127.0.0.1:8000/v1/chat/completions, config={'timeout': 70000, 'batch_size': 10, 'stream': False, 'max_tokens': 30000, 'top_p': 0.95, 'temperature': 0.6, 'top_k': 20, 'n': 1}, model_args={}
2025-10-16 19:23:27 - evalscope - WARNING: server in model_apis.py has been deprecated since version 1.0.0. and will be removed in version 1.1.0. Use openai_api instead
2025-10-16 19:23:27 - evalscope - INFO: Dump task config to ./outputs/20251016_192326/configs/task_config_a30b61.yaml
2025-10-16 19:23:27 - evalscope - INFO: {
"model": "Qwen3-30B-A3B",
"model_id": "Qwen3-30B-A3B",
"model_args": {},
"model_task": "text_generation",
"chat_template": null,
"datasets": [
"iquiz"
],
"dataset_args": {
"aime24": {
"dataset_id": "HuggingFaceH4/aime_2024",
"filters": {
"remove_until": ""
}
},
"iquiz": {
"name": "iquiz",
"dataset_id": "AI-ModelScope/IQuiz",
"output_types": [
"generation"
],
"subset_list": [
"IQ",
"EQ"
],
"default_subset": "default",
"few_shot_num": 0,
"few_shot_random": false,
"train_split": null,
"eval_split": "test",
"prompt_template": "回答下面的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"LETTER"(不带引号),其中 LETTER 是 {letters} 中的一个。请在回答前进行一步步思考。\n\n问题:{question}\n选项:\n{choices}\n",
"few_shot_prompt_template": null,
"system_prompt": null,
"query_template": null,
"pretty_name": "IQuiz",
"description": "IQuiz is a benchmark for evaluating AI models on IQ and EQ questions. It consists of multiple-choice questions where the model must select the correct answer and provide an explanation.",
"tags": [
"Knowledge",
"MCQ",
"Chinese"
],
"filters": {
"remove_until": ""
},
"metric_list": [
"acc"
],
"aggregation": "mean",
"shuffle": false,
"shuffle_choices": false,
"review_timeout": null,
"extra_params": {}
},
"mmlu": {
"dataset_id": "cais/mmlu",
"filters": {
"remove_until": ""
}
},
"super_gpqa": {
"dataset_id": "m-a-p/SuperGPQA",
"filters": {
"remove_until": ""
}
},
"bbh": {
"dataset_id": "modelscope/bbh",
"filters": {
"remove_until": ""
}
},
"gsm8k": {
"dataset_id": "modelscope/gsm8k",
"filters": {
"remove_until": ""
}
}
},
"dataset_dir": "/root/.cache/modelscope/hub/datasets",
"dataset_hub": "modelscope",
"repeats": 1,
"generation_config": {
"timeout": 70000,
"batch_size": 10,
"stream": false,
"max_tokens": 30000,
"top_p": 0.95,
"temperature": 0.6,
"top_k": 20,
"n": 1
},
"eval_type": "server",
"eval_backend": "Native",
"eval_config": null,
"limit": 120,
"eval_batch_size": 10,
"use_cache": null,
"rerun_review": false,
"work_dir": "./outputs/20251016_192326",
"ignore_errors": true,
"debug": false,
"seed": 42,
"api_url": "http://127.0.0.1:8000/v1/chat/completions",
"timeout": 70000,
"stream": false,
"judge_strategy": "auto",
"judge_worker_num": 1,
"judge_model_args": {},
"analysis_report": false,
"use_sandbox": false,
"sandbox_type": "docker",
"sandbox_manager_config": {},
"sandbox_config": {}
}
2025-10-16 19:23:27 - evalscope - INFO: Loading dataset AI-ModelScope/IQuiz from modelscope > subset: IQ > split: test ...
Downloading [README.md]: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.33k/3.33k [00:00<00:00, 32.3MB/s]
2025-10-16 19:23:36,177 - modelscope - ERROR - >> Error loading AI-ModelScope/IQuiz: Unable to find 'hf://datasets/AI-ModelScope/IQuiz@master/IQ.jsonl' with any supported extension ['.csv', '.tsv', '.json', '.jsonl', '.ndjson', '.parquet', '.geoparquet', '.gpq', '.arrow', '.txt', '.tar', '.xml', '.blp', '.bmp', '.dib', '.bufr', '.cur', '.pcx', '.dcx', '.dds', '.ps', '.eps', '.fit', '.fits', '.fli', '.flc', '.ftc', '.ftu', '.gbr', '.gif', '.grib', '.png', '.apng', '.jp2', '.j2k', '.jpc', '.jpf', '.jpx', '.j2c', '.icns', '.ico', '.im', '.iim', '.tif', '.tiff', '.jfif', '.jpe', '.jpg', '.jpeg', '.mpg', '.mpeg', '.msp', '.pcd', '.pxr', '.pbm', '.pgm', '.ppm', '.pnm', '.psd', '.bw', '.rgb', '.rgba', '.sgi', '.ras', '.tga', '.icb', '.vda', '.vst', '.webp', '.wmf', '.emf', '.xbm', '.xpm', '.BLP', '.BMP', '.DIB', '.BUFR', '.CUR', '.PCX', '.DCX', '.DDS', '.PS', '.EPS', '.FIT', '.FITS', '.FLI', '.FLC', '.FTC', '.FTU', '.GBR', '.GIF', '.GRIB', '.PNG', '.APNG', '.JP2', '.J2K', '.JPC', '.JPF', '.JPX', '.J2C', '.ICNS', '.ICO', '.IM', '.IIM', '.TIF', '.TIFF', '.JFIF', '.JPE', '.JPG', '.JPEG', '.MPG', '.MPEG', '.MSP', '.PCD', '.PXR', '.PBM', '.PGM', '.PPM', '.PNM', '.PSD', '.BW', '.RGB', '.RGBA', '.SGI', '.RAS', '.TGA', '.ICB', '.VDA', '.VST', '.WEBP', '.WMF', '.EMF', '.XBM', '.XPM', '.aiff', '.au', '.avr', '.caf', '.flac', '.htk', '.svx', '.mat4', '.mat5', '.mpc2k', '.ogg', '.paf', '.pvf', '.raw', '.rf64', '.sd2', '.sds', '.ircam', '.voc', '.w64', '.wav', '.nist', '.wavex', '.wve', '.xi', '.mp3', '.opus', '.AIFF', '.AU', '.AVR', '.CAF', '.FLAC', '.HTK', '.SVX', '.MAT4', '.MAT5', '.MPC2K', '.OGG', '.PAF', '.PVF', '.RAW', '.RF64', '.SD2', '.SDS', '.IRCAM', '.VOC', '.W64', '.WAV', '.NIST', '.WAVEX', '.WVE', '.XI', '.MP3', '.OPUS', '.mkv', '.mp4', '.avi', '.mov', '.MKV', '.MP4', '.AVI', '.MOV', '.pdf', '.PDF', '.zip']
Traceback (most recent call last):
File "/mnt/workspace/E2Eeval/eval_public2.py", line 76, in <module>
run_task(task_cfg=task_cfg)
File "/usr/local/lib/python3.12/site-packages/evalscope/run.py", line 28, in run_task
return run_single_task(task_cfg, run_time)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/evalscope/run.py", line 41, in run_single_task
result = evaluate_model(task_cfg, outputs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/evalscope/run.py", line 144, in evaluate_model
res_dict = evaluator.eval()
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/evalscope/evaluator/evaluator.py", line 94, in eval
dataset_dict = self.benchmark.load_dataset()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/evalscope/api/benchmark/adapters/default_data_adapter.py", line 62, in load_dataset
self.test_dataset, self.fewshot_dataset = self.load()
^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/evalscope/api/benchmark/adapters/default_data_adapter.py", line 81, in load
return self.load_from_remote()
^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/evalscope/api/benchmark/adapters/default_data_adapter.py", line 89, in load_from_remote
test_dataset = self.load_subsets(test_load_func)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/evalscope/api/benchmark/adapters/default_data_adapter.py", line 215, in load_subsets
subset_data = load_func(subset)
^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/evalscope/api/benchmark/adapters/default_data_adapter.py", line 248, in load_subset
dataset = loader.load()
^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/evalscope/api/dataset/loader.py", line 88, in load
dataset = MsDataset.load(
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/modelscope/msdatasets/ms_dataset.py", line 308, in load
with load_dataset_with_ctx(
File "/usr/local/lib/python3.12/contextlib.py", line 137, in __enter__
return next(self.gen)
^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/modelscope/msdatasets/utils/hf_datasets_util.py", line 1475, in load_dataset_with_ctx
dataset_res = DatasetsWrapperHF.load_dataset(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/modelscope/msdatasets/utils/hf_datasets_util.py", line 991, in load_dataset
builder_instance = DatasetsWrapperHF.load_dataset_builder(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/modelscope/msdatasets/utils/hf_datasets_util.py", line 1125, in load_dataset_builder
dataset_module = DatasetsWrapperHF.dataset_module_factory(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/modelscope/msdatasets/utils/hf_datasets_util.py", line 1425, in dataset_module_factory
raise FileNotFoundError(
FileNotFoundError: Couldn't find a dataset script at /mnt/workspace/E2Eeval/AI-ModelScope/IQuiz/IQuiz.py or any data file in the same directory. Couldn't find 'AI-ModelScope/IQuiz' on the Hugging Face Hub either: FileNotFoundError: Unable to find 'hf://datasets/AI-ModelScope/IQuiz@master/IQ.jsonl' with any supported extension ['.csv', '.tsv', '.json', '.jsonl', '.ndjson', '.parquet', '.geoparquet', '.gpq', '.arrow', '.txt', '.tar', '.xml', '.blp', '.bmp', '.dib', '.bufr', '.cur', '.pcx', '.dcx', '.dds', '.ps', '.eps', '.fit', '.fits', '.fli', '.flc', '.ftc', '.ftu', '.gbr', '.gif', '.grib', '.png', '.apng', '.jp2', '.j2k', '.jpc', '.jpf', '.jpx', '.j2c', '.icns', '.ico', '.im', '.iim', '.tif', '.tiff', '.jfif', '.jpe', '.jpg', '.jpeg', '.mpg', '.mpeg', '.msp', '.pcd', '.pxr', '.pbm', '.pgm', '.ppm', '.pnm', '.psd', '.bw', '.rgb', '.rgba', '.sgi', '.ras', '.tga', '.icb', '.vda', '.vst', '.webp', '.wmf', '.emf', '.xbm', '.xpm', '.BLP', '.BMP', '.DIB', '.BUFR', '.CUR', '.PCX', '.DCX', '.DDS', '.PS', '.EPS', '.FIT', '.FITS', '.FLI', '.FLC', '.FTC', '.FTU', '.GBR', '.GIF', '.GRIB', '.PNG', '.APNG', '.JP2', '.J2K', '.JPC', '.JPF', '.JPX', '.J2C', '.ICNS', '.ICO', '.IM', '.IIM', '.TIF', '.TIFF', '.JFIF', '.JPE', '.JPG', '.JPEG', '.MPG', '.MPEG', '.MSP', '.PCD', '.PXR', '.PBM', '.PGM', '.PPM', '.PNM', '.PSD', '.BW', '.RGB', '.RGBA', '.SGI', '.RAS', '.TGA', '.ICB', '.VDA', '.VST', '.WEBP', '.WMF', '.EMF', '.XBM', '.XPM', '.aiff', '.au', '.avr', '.caf', '.flac', '.htk', '.svx', '.mat4', '.mat5', '.mpc2k', '.ogg', '.paf', '.pvf', '.raw', '.rf64', '.sd2', '.sds', '.ircam', '.voc', '.w64', '.wav', '.nist', '.wavex', '.wve', '.xi', '.mp3', '.opus', '.AIFF', '.AU', '.AVR', '.CAF', '.FLAC', '.HTK', '.SVX', '.MAT4', '.MAT5', '.MPC2K', '.OGG', '.PAF', '.PVF', '.RAW', '.RF64', '.SD2', '.SDS', '.IRCAM', '.VOC', '.W64', '.WAV', '.NIST', '.WAVEX', '.WVE', '.XI', '.MP3', '.OPUS', '.mkv', '.mp4', '.avi', '.mov', '.MKV', '.MP4', '.AVI', '.MOV', '.pdf', '.PDF', '.zip']