From 134f61fa85f669287ee3a9ffb9bcfc6ed5bbeec6 Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Mon, 21 Oct 2024 19:17:39 +0800 Subject: [PATCH 01/23] align sft formats --- configs/config_all.yaml | 1 - data_juicer/ops/base_op.py | 5 ++ data_juicer/ops/mapper/extract_qa_mapper.py | 75 +++++++++---------- .../ops/mapper/generate_instruction_mapper.py | 22 ++++-- .../ops/mapper/optimize_instruction_mapper.py | 4 +- tests/ops/mapper/test_extract_qa_mapper.py | 17 ++--- .../test_generate_instruction_mapper.py | 11 +-- .../test_optimize_instruction_mapper.py | 10 +-- 8 files changed, 71 insertions(+), 74 deletions(-) diff --git a/configs/config_all.yaml b/configs/config_all.yaml index a28861c773..c5b4e803a4 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -64,7 +64,6 @@ process: - extract_qa_mapper: # mapper to extract question and answer pair from text. hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa' # model name on huggingface to extract question and answer pair. pattern: null # regular expression pattern to search for within text. - qa_format: 'chatml' # Output format of question and answer pair. enable_vllm: true # Whether to use vllm for inference acceleration. tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism. max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config. diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 6eecab75f2..10e79f6e88 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -139,6 +139,11 @@ def __init__(self, *args, **kwargs): self.image_key = kwargs.get('image_key', 'images') self.audio_key = kwargs.get('audio_key', 'audios') self.video_key = kwargs.get('video_key', 'videos') + + self.query_key = kwargs.get('query_key', 'query') + self.response_key = kwargs.get('response_key', 'response') + self.history_key = kwargs.get('history_key', 'history') + self.batch_size = kwargs.get('batch_size', 1000) # whether the model can be accelerated using cuda diff --git a/data_juicer/ops/mapper/extract_qa_mapper.py b/data_juicer/ops/mapper/extract_qa_mapper.py index 23d99c1af3..3a9e498ff6 100644 --- a/data_juicer/ops/mapper/extract_qa_mapper.py +++ b/data_juicer/ops/mapper/extract_qa_mapper.py @@ -1,4 +1,3 @@ -import json import re from typing import Dict, Optional @@ -38,12 +37,12 @@ class ExtractQAMapper(Mapper): """ _accelerator = 'cuda' + _batched_op = True def __init__(self, hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', trust_remote_code: bool = False, pattern: Optional[str] = None, - qa_format: str = 'chatml', enable_vllm: bool = True, tensor_parallel_size: Optional[int] = None, max_model_len: Optional[int] = None, @@ -56,7 +55,6 @@ def __init__(self, :param hf_model: Hugginface model id. :param trust_remote_code: passed to transformers :param pattern: regular expression pattern to search for within text. - :param qa_format: Output format of question and answer pair. :param enable_vllm: Whether to use vllm for inference acceleration. :param tensor_parallel_size: It is only valid when enable_vllm is True. 
The number of GPUs to use for distributed execution with tensor @@ -93,7 +91,6 @@ def __init__(self, else: self.pattern = pattern - self.qa_format = qa_format self.enable_vllm = enable_vllm if enable_vllm: @@ -133,41 +130,39 @@ def _extract_qa(self, output): return qa_list - def process_single(self, sample, rank=None): + def process_batched(self, samples, rank=None): model, processor = get_model(self.model_key, rank, self.use_cuda()) - if self.enable_vllm: - response = model.generate([sample[self.text_key]], - self.sampling_params) - output = response[0].outputs[0].text - else: - inputs = processor(sample[self.text_key], - return_tensors='pt').to(model.device) - response = model.generate(**inputs, **self.sampling_params) - output = processor.decode(response.cpu()[0], - skip_special_tokens=True) - - qa_list = self._extract_qa(output) - - if not len(qa_list): - logger.info( - 'No question and answer data was extracted from this sample!') - - dialogue_data = [] - if self.qa_format == 'chatml': - for qa in qa_list: - dialogue_data.append({ - 'messages': [{ - 'role': 'user', - 'content': qa[0] - }, { - 'role': 'assistant', - 'content': qa[1] - }] - }) - else: - raise ValueError(f'Not support {self.qa_format}!') - - sample[self.text_key] = json.dumps(dialogue_data, ensure_ascii=False) - - return sample + keys = samples.keys() + first_key = next(iter(keys)) + num_samples = len(samples[first_key]) + out_samples = { + key: [] + for key in keys | {self.query_key, self.response_key} + } + for i in range(num_samples): + sample = {key: samples[key][i] for key in keys} + if self.enable_vllm: + response = model.generate([sample[self.text_key]], + self.sampling_params) + output = response[0].outputs[0].text + else: + inputs = processor(sample[self.text_key], + return_tensors='pt').to(model.device) + response = model.generate(**inputs, **self.sampling_params) + output = processor.decode(response.cpu()[0], + skip_special_tokens=True) + + qa_list = self._extract_qa(output) + + if len(qa_list) > 0: + for q, a in qa_list: + for k, v in sample.items(): + out_samples[k].append(v) + out_samples[self.query_key].append(q) + out_samples[self.response_key].append(a) + else: + logger.info( + 'No question and answer was extracted from this sample!') + + return out_samples diff --git a/data_juicer/ops/mapper/generate_instruction_mapper.py b/data_juicer/ops/mapper/generate_instruction_mapper.py index 91547b2455..6903e102df 100644 --- a/data_juicer/ops/mapper/generate_instruction_mapper.py +++ b/data_juicer/ops/mapper/generate_instruction_mapper.py @@ -217,11 +217,12 @@ def parse_chatml_str(self, input_str): return qa_pairs def parse_response(self, response_str): + logger.debug(response_str) pattern = self.qa_extraction_pattern matches = re.findall(pattern, response_str, re.DOTALL) response_str = '' out_qa_pairs = [] - for i, match in enumerate(matches): + for match in matches: question, answer = match question = question.strip() answer = answer.strip() @@ -264,11 +265,14 @@ def process_single(self, sample=None, rank=None): output_ids = output_ids[:, inputs.data['input_ids'].shape[1]:] response_str = processor.decode(output_ids.cpu()[0], skip_special_tokens=True) - message_list = [] out_qa_pairs, response_str = self.parse_response(response_str) if not response_str: - return {self.text_key: json.dumps({'messages': message_list})} + return { + self.query_key: '', + self.response_key: '', + self.history_key: [] + } if self.similarity_type == 'rouge_l': sim_score = self.max_rouge_l_score(response_str, @@ -278,13 +282,15 @@ 
def process_single(self, sample=None, rank=None): f'Not support similarity type "{self.similarity_type}"!') if sim_score <= self.similarity_threshold: - for question, answer in out_qa_pairs: - message_list.append({'role': 'user', 'content': question}) - message_list.append({'role': 'assistant', 'content': answer}) + query, response = out_qa_pairs[-1] + history = out_qa_pairs[:-1] else: + query = response = '' + history = [] logger.info('Filter this generated sample due to similarity.') return { - self.text_key: - json.dumps({'messages': message_list}, ensure_ascii=False) + self.query_key: query, + self.response_key: response, + self.history_key: history } diff --git a/data_juicer/ops/mapper/optimize_instruction_mapper.py b/data_juicer/ops/mapper/optimize_instruction_mapper.py index 34e2affbf7..6348d30555 100644 --- a/data_juicer/ops/mapper/optimize_instruction_mapper.py +++ b/data_juicer/ops/mapper/optimize_instruction_mapper.py @@ -101,7 +101,7 @@ def process_single(self, sample=None, rank=None): 'content': self.system_prompt }, { 'role': 'user', - 'content': sample[self.text_key] + 'content': sample[self.query_key] }] input_prompt = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True) @@ -118,6 +118,6 @@ def process_single(self, sample=None, rank=None): output = processor.decode(response.cpu()[0], skip_special_tokens=True) - sample[self.text_key] = output + sample[self.query_key] = output return sample diff --git a/tests/ops/mapper/test_extract_qa_mapper.py b/tests/ops/mapper/test_extract_qa_mapper.py index 648996a9f7..e3dd8fa61e 100644 --- a/tests/ops/mapper/test_extract_qa_mapper.py +++ b/tests/ops/mapper/test_extract_qa_mapper.py @@ -1,5 +1,6 @@ import unittest -import json +from loguru import logger +from data_juicer.core.data import NestedDataset as Dataset from data_juicer.ops.mapper.extract_qa_mapper import ExtractQAMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) @@ -13,20 +14,14 @@ class ExtractQAMapperTest(DataJuicerTestCaseBase): def _run_extract_qa(self, samples, enable_vllm=False, sampling_params={}, **kwargs): op = ExtractQAMapper( hf_model='alibaba-pai/pai-qwen1_5-7b-doc2qa', - qa_format='chatml', enable_vllm=enable_vllm, sampling_params=sampling_params, **kwargs ) - for sample in samples: - result = op.process(sample) - out_text = json.loads(result[self.text_key]) - print(f'Output sample: {out_text}') - - # test one output qa sample - qa_sample = out_text[0] - self.assertIn('role', qa_sample['messages'][0]) - self.assertIn('content', qa_sample['messages'][0]) + dataset = Dataset.from_list(samples) + dataset = dataset.map(op.process, batch_size=2) + for row in dataset: + logger.info(row) def test_extract_qa(self): samples = [ diff --git a/tests/ops/mapper/test_generate_instruction_mapper.py b/tests/ops/mapper/test_generate_instruction_mapper.py index 0bd7a10998..34b1f9f3cd 100644 --- a/tests/ops/mapper/test_generate_instruction_mapper.py +++ b/tests/ops/mapper/test_generate_instruction_mapper.py @@ -1,5 +1,6 @@ import unittest import json +from loguru import logger from data_juicer.ops.mapper.generate_instruction_mapper import GenerateInstructionMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) @@ -24,13 +25,9 @@ def _run_generate_instruction(self, enable_vllm=False): dataset = dataset.map(op.process) - for item in dataset: - out_sample = json.loads(item[self.text_key]) - print(f'Output sample: {out_sample}') - # test one output qa sample - 
self.assertIn('role', out_sample['messages'][0]) - self.assertIn('content', out_sample['messages'][0]) - + for row in dataset: + logger.info(row) + def test_generate_instruction(self): self._run_generate_instruction() diff --git a/tests/ops/mapper/test_optimize_instruction_mapper.py b/tests/ops/mapper/test_optimize_instruction_mapper.py index 7c7b58b4c0..ebd1603d0b 100644 --- a/tests/ops/mapper/test_optimize_instruction_mapper.py +++ b/tests/ops/mapper/test_optimize_instruction_mapper.py @@ -1,4 +1,5 @@ import unittest +from loguru import logger from data_juicer.ops.mapper.optimize_instruction_mapper import OptimizeInstructionMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) @@ -7,8 +8,7 @@ # These tests have been tested locally. @SKIPPED_TESTS.register_module() class OptimizeInstructionMapperTest(DataJuicerTestCaseBase): - - text_key = 'text' + query_key = 'query' def _run_optimize_instruction(self, enable_vllm=False): op = OptimizeInstructionMapper( @@ -17,13 +17,13 @@ def _run_optimize_instruction(self, enable_vllm=False): ) samples = [ - {self.text_key: '鱼香肉丝怎么做?'} + {self.query_key: '鱼香肉丝怎么做?'} ] for sample in samples: result = op.process(sample) - print(f'Output results: {result}') - self.assertIn(self.text_key, result) + logger.info(f'Output results: {result}') + self.assertIn(self.query_key, result) def test_optimize_instruction(self): self._run_optimize_instruction() From a68d925b73f2a773fbfac268dfd0e741e6a39c95 Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Tue, 22 Oct 2024 11:06:17 +0800 Subject: [PATCH 02/23] fix test --- .../ops/filter/language_id_score_filter.py | 2 +- tests/config/test_config_funcs.py | 25 +++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index a3e9bd3c94..c00bf8413b 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -33,7 +33,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['fasttext', 'fasttext-wheel']) + AUTOINSTALL.check(['fasttext-wheel']) if not lang: # lang is [], '' or None self.lang = None diff --git a/tests/config/test_config_funcs.py b/tests/config/test_config_funcs.py index c024ceb0fe..1cb7c4463f 100644 --- a/tests/config/test_config_funcs.py +++ b/tests/config/test_config_funcs.py @@ -45,6 +45,9 @@ def test_yaml_cfg_file(self): 'image_key': 'images', 'audio_key': 'audios', 'video_key': 'videos', + 'query_key': 'query', + 'response_key': 'response', + 'history_key': 'history', 'accelerator': None, 'num_proc': 4, 'cpu_required': 1, @@ -62,6 +65,9 @@ def test_yaml_cfg_file(self): 'image_key': 'images', 'audio_key': 'audios', 'video_key': 'videos', + 'query_key': 'query', + 'response_key': 'response', + 'history_key': 'history', 'accelerator': None, 'num_proc': 4, 'stats_export_path': None, @@ -128,6 +134,9 @@ def test_mixture_cfg(self): 'image_key': 'images', 'audio_key': 'audios', 'video_key': 'videos', + 'query_key': 'query', + 'response_key': 'response', + 'history_key': 'history', 'accelerator': None, 'num_proc': 4, 'stats_export_path': None, @@ -146,6 +155,9 @@ def test_mixture_cfg(self): 'image_key': 'images', 'audio_key': 'audios', 'video_key': 'videos', + 'query_key': 'query', + 'response_key': 'response', + 'history_key': 'history', 'accelerator': None, 'num_proc': 4, 'stats_export_path': None, @@ 
-164,6 +176,9 @@ def test_mixture_cfg(self): 'image_key': 'images', 'audio_key': 'audios', 'video_key': 'videos', + 'query_key': 'query', + 'response_key': 'response', + 'history_key': 'history', 'accelerator': None, 'num_proc': 4, 'stats_export_path': None, @@ -182,6 +197,9 @@ def test_mixture_cfg(self): 'image_key': 'images', 'audio_key': 'audios', 'video_key': 'videos', + 'query_key': 'query', + 'response_key': 'response', + 'history_key': 'history', 'accelerator': None, 'num_proc': 4, 'stats_export_path': None, @@ -200,6 +218,9 @@ def test_mixture_cfg(self): 'image_key': 'images', 'audio_key': 'audios', 'video_key': 'videos', + 'query_key': 'query', + 'response_key': 'response', + 'history_key': 'history', 'accelerator': None, 'num_proc': 4, 'stats_export_path': None, @@ -216,8 +237,8 @@ def test_op_params_parsing(self): from data_juicer.ops.base_op import OPERATORS base_class_params = { - 'text_key', 'image_key', 'audio_key', 'video_key', 'accelerator', - 'num_proc', 'cpu_required', 'mem_required', + 'text_key', 'image_key', 'audio_key', 'video_key', 'query_key', 'response_key', 'history_key', + 'accelerator', 'turbo', 'batch_size', 'num_proc', 'cpu_required', 'mem_required', } parser = ArgumentParser(default_env=True, default_config_files=None) From 87fc4bbea8658049aa95408bcc167f1cf250078e Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Tue, 22 Oct 2024 19:29:42 +0800 Subject: [PATCH 03/23] minor fix --- tests/ops/mapper/test_extract_qa_mapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ops/mapper/test_extract_qa_mapper.py b/tests/ops/mapper/test_extract_qa_mapper.py index a0e3130152..4b46fbe94d 100644 --- a/tests/ops/mapper/test_extract_qa_mapper.py +++ b/tests/ops/mapper/test_extract_qa_mapper.py @@ -15,7 +15,6 @@ def _run_extract_qa(self, samples, enable_vllm=False, sampling_params={}, **kwar op = ExtractQAMapper( hf_model='alibaba-pai/pai-qwen1_5-7b-doc2qa', trust_remote_code=True, - qa_format='chatml', enable_vllm=enable_vllm, sampling_params=sampling_params, **kwargs) From d309428d30f4d1382b922acfa1872d6ed745b967 Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Wed, 23 Oct 2024 15:42:26 +0800 Subject: [PATCH 04/23] improve tests assert --- tests/ops/mapper/test_extract_qa_mapper.py | 4 ++++ tests/ops/mapper/test_generate_instruction_mapper.py | 4 ++++ tests/ops/mapper/test_optimize_instruction_mapper.py | 4 +++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/ops/mapper/test_extract_qa_mapper.py b/tests/ops/mapper/test_extract_qa_mapper.py index 4b46fbe94d..384547f40a 100644 --- a/tests/ops/mapper/test_extract_qa_mapper.py +++ b/tests/ops/mapper/test_extract_qa_mapper.py @@ -22,6 +22,10 @@ def _run_extract_qa(self, samples, enable_vllm=False, sampling_params={}, **kwar dataset = dataset.map(op.process, batch_size=2) for row in dataset: logger.info(row) + # Note: If switching models causes this assert to fail, it may not be a code issue; + # the model might just have limited capabilities. 
+ self.assertNotEqual(row[op.query_key], '') + self.assertNotEqual(row[op.response_key], '') def test_extract_qa(self): samples = [ diff --git a/tests/ops/mapper/test_generate_instruction_mapper.py b/tests/ops/mapper/test_generate_instruction_mapper.py index 37dfa9fc2a..e0fee57c28 100644 --- a/tests/ops/mapper/test_generate_instruction_mapper.py +++ b/tests/ops/mapper/test_generate_instruction_mapper.py @@ -28,6 +28,10 @@ def _run_generate_instruction(self, enable_vllm=False): for row in dataset: logger.info(row) + # Note: If switching models causes this assert to fail, it may not be a code issue; + # the model might just have limited capabilities. + self.assertNotEqual(row[op.query_key], '') + self.assertNotEqual(row[op.response_key], '') def test_generate_instruction(self): self._run_generate_instruction() diff --git a/tests/ops/mapper/test_optimize_instruction_mapper.py b/tests/ops/mapper/test_optimize_instruction_mapper.py index ebd1603d0b..94d03c0dff 100644 --- a/tests/ops/mapper/test_optimize_instruction_mapper.py +++ b/tests/ops/mapper/test_optimize_instruction_mapper.py @@ -23,7 +23,9 @@ def _run_optimize_instruction(self, enable_vllm=False): for sample in samples: result = op.process(sample) logger.info(f'Output results: {result}') - self.assertIn(self.query_key, result) + # Note: If switching models causes this assert to fail, it may not be a code issue; + # the model might just have limited capabilities. + self.assertNotEqual(sample[op.query_key], '') def test_optimize_instruction(self): self._run_optimize_instruction() From df3610cfec0be82a51f8203c901863ed4c904994 Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:23:15 +0800 Subject: [PATCH 05/23] pre-commit --- configs/config_all.yaml | 6 +- data_juicer/ops/filter/__init__.py | 62 ++++++------------- data_juicer/ops/mapper/__init__.py | 22 +++---- ...py => generate_qa_from_examples_mapper.py} | 7 ++- ...per.py => generate_qa_from_text_mapper.py} | 8 +-- ...ion_mapper.py => optimize_query_mapper.py} | 6 +- data_juicer/utils/auto_install_mapping.py | 16 ++--- docs/Operators.md | 8 +-- docs/Operators_ZH.md | 8 +-- ... test_generate_qa_from_examples_mapper.py} | 6 +- ...y => test_generate_qa_from_text_mapper.py} | 6 +- ...apper.py => test_optimize_query_mapper.py} | 6 +- 12 files changed, 68 insertions(+), 93 deletions(-) rename data_juicer/ops/mapper/{generate_instruction_mapper.py => generate_qa_from_examples_mapper.py} (98%) rename data_juicer/ops/mapper/{extract_qa_mapper.py => generate_qa_from_text_mapper.py} (97%) rename data_juicer/ops/mapper/{optimize_instruction_mapper.py => optimize_query_mapper.py} (97%) rename tests/ops/mapper/{test_generate_instruction_mapper.py => test_generate_qa_from_examples_mapper.py} (87%) rename tests/ops/mapper/{test_extract_qa_mapper.py => test_generate_qa_from_text_mapper.py} (90%) rename tests/ops/mapper/{test_optimize_instruction_mapper.py => test_optimize_query_mapper.py} (86%) diff --git a/configs/config_all.yaml b/configs/config_all.yaml index c5b4e803a4..bd7109936d 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -61,7 +61,7 @@ process: - clean_links_mapper: # remove web links from text. - clean_copyright_mapper: # remove copyright comments. - expand_macro_mapper: # expand macro definitions in Latex text. - - extract_qa_mapper: # mapper to extract question and answer pair from text. + - generate_qa_from_text_mapper: # mapper to generate question and answer pairs from text. 
hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa' # model name on huggingface to extract question and answer pair. pattern: null # regular expression pattern to search for within text. enable_vllm: true # Whether to use vllm for inference acceleration. @@ -70,7 +70,7 @@ process: max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration. sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} - fix_unicode_mapper: # fix unicode errors in text. - - generate_instruction_mapper: # generate new instruction text data. + - generate_qa_from_examples_mapper: # mapper to generate question and answer pairs from examples. hf_model: 'Qwen/Qwen-7B-Chat' # model name on huggingface to generate instruction. seed_file: 'demos/data/demo-dataset-chatml.jsonl' # Seed file as instruction samples to generate new instructions, chatml format. instruct_num: 3 # the number of generated samples. @@ -145,7 +145,7 @@ process: delete_random_char: false # whether to open the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强" swap_random_char: false # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法" replace_equivalent_num: false # whether to open the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法" - - optimize_instruction_mapper: # optimize instruction. + - optimize_query_mapper: # optimize instruction query. hf_model: 'alibaba-pai/Qwen2-7B-Instruct-Refine' # model name on huggingface to optimize instruction enable_vllm: true # whether to use vllm for inference acceleration. tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism. 
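For context, a minimal usage sketch of the renamed op under this config, assuming the data-juicer API exactly as exercised by the updated unit tests in this series: the model id, `Dataset.from_list`, and `dataset.map(op.process, batch_size=2)` calls are taken from `tests/ops/mapper/test_generate_qa_from_text_mapper.py`, while the input text and the comment about output columns are illustrative assumptions, not part of the patch.

```python
# Sketch only: assumes the data-juicer package from this patch series is importable.
from loguru import logger

from data_juicer.core.data import NestedDataset as Dataset
from data_juicer.ops.mapper.generate_qa_from_text_mapper import GenerateQAFromTextMapper

# Model id and call pattern mirror the updated test; the sample text is a placeholder.
op = GenerateQAFromTextMapper(
    hf_model='alibaba-pai/pai-qwen1_5-7b-doc2qa',
    trust_remote_code=True,
    enable_vllm=False,   # set True (as in config_all.yaml) to use vllm acceleration
    sampling_params={},
)

samples = [{'text': '蒙古国的首都是乌兰巴托(Ulaanbaatar)。'}]
dataset = Dataset.from_list(samples)
dataset = dataset.map(op.process, batch_size=2)

for row in dataset:
    # With this change, each extracted QA pair is expected to land in the
    # 'query'/'response' columns rather than a JSON-encoded chatml string in 'text'.
    logger.info(row)
```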
diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py index 68e9ba521f..338870adde 100644 --- a/data_juicer/ops/filter/__init__.py +++ b/data_juicer/ops/filter/__init__.py @@ -64,49 +64,23 @@ from .word_repetition_filter import WordRepetitionFilter from .words_num_filter import WordsNumFilter +# yapf: enable + __all__ = [ - 'ImageTextSimilarityFilter', - 'VideoAspectRatioFilter', - 'ImageTextMatchingFilter', - 'ImageNSFWFilter', - 'TokenNumFilter', - 'TextLengthFilter', - 'SpecifiedNumericFieldFilter', - 'AudioNMFSNRFilter', - 'VideoAestheticsFilter', - 'PerplexityFilter', - 'PhraseGroundingRecallFilter', - 'MaximumLineLengthFilter', - 'AverageLineLengthFilter', - 'SpecifiedFieldFilter', - 'VideoTaggingFromFramesFilter', - 'TextEntityDependencyFilter', - 'VideoResolutionFilter', - 'AlphanumericFilter', - 'ImageWatermarkFilter', - 'ImageAestheticsFilter', - 'AudioSizeFilter', - 'StopWordsFilter', - 'CharacterRepetitionFilter', - 'ImageShapeFilter', - 'VideoDurationFilter', - 'TextActionFilter', - 'VideoOcrAreaRatioFilter', - 'VideoNSFWFilter', - 'SpecialCharactersFilter', - 'VideoFramesTextSimilarityFilter', - 'ImageAspectRatioFilter', - 'AudioDurationFilter', - 'LanguageIDScoreFilter', - 'SuffixFilter', - 'ImageSizeFilter', - 'VideoWatermarkFilter', - 'WordsNumFilter', - 'ImageFaceRatioFilter', - 'FlaggedWordFilter', - 'WordRepetitionFilter', - 'VideoMotionScoreFilter', - 'ImagePairSimilarityFilter' + 'ImageTextSimilarityFilter', 'VideoAspectRatioFilter', + 'ImageTextMatchingFilter', 'ImageNSFWFilter', 'TokenNumFilter', + 'TextLengthFilter', 'SpecifiedNumericFieldFilter', 'AudioNMFSNRFilter', + 'VideoAestheticsFilter', 'PerplexityFilter', 'PhraseGroundingRecallFilter', + 'MaximumLineLengthFilter', 'AverageLineLengthFilter', + 'SpecifiedFieldFilter', 'VideoTaggingFromFramesFilter', + 'TextEntityDependencyFilter', 'VideoResolutionFilter', + 'AlphanumericFilter', 'ImageWatermarkFilter', 'ImageAestheticsFilter', + 'AudioSizeFilter', 'StopWordsFilter', 'CharacterRepetitionFilter', + 'ImageShapeFilter', 'VideoDurationFilter', 'TextActionFilter', + 'VideoOcrAreaRatioFilter', 'VideoNSFWFilter', 'SpecialCharactersFilter', + 'VideoFramesTextSimilarityFilter', 'ImageAspectRatioFilter', + 'AudioDurationFilter', 'LanguageIDScoreFilter', 'SuffixFilter', + 'ImageSizeFilter', 'VideoWatermarkFilter', 'WordsNumFilter', + 'ImageFaceRatioFilter', 'FlaggedWordFilter', 'WordRepetitionFilter', + 'VideoMotionScoreFilter', 'ImagePairSimilarityFilter' ] - -# yapf: enable diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py index eb814b374a..b61997c11c 100644 --- a/data_juicer/ops/mapper/__init__.py +++ b/data_juicer/ops/mapper/__init__.py @@ -2,12 +2,12 @@ from . 
import (audio_ffmpeg_wrapped_mapper, chinese_convert_mapper, clean_copyright_mapper, clean_email_mapper, clean_html_mapper, clean_ip_mapper, clean_links_mapper, expand_macro_mapper, - extract_qa_mapper, fix_unicode_mapper, - generate_instruction_mapper, image_blur_mapper, + fix_unicode_mapper, generate_qa_from_examples_mapper, + generate_qa_from_text_mapper, image_blur_mapper, image_captioning_from_gpt4v_mapper, image_captioning_mapper, image_diffusion_mapper, image_face_blur_mapper, image_tagging_mapper, nlpaug_en_mapper, nlpcda_zh_mapper, - optimize_instruction_mapper, punctuation_normalization_mapper, + optimize_query_mapper, punctuation_normalization_mapper, remove_bibliography_mapper, remove_comments_mapper, remove_header_mapper, remove_long_words_mapper, remove_non_chinese_character_mapper, @@ -34,9 +34,9 @@ from .clean_ip_mapper import CleanIpMapper from .clean_links_mapper import CleanLinksMapper from .expand_macro_mapper import ExpandMacroMapper -from .extract_qa_mapper import ExtractQAMapper from .fix_unicode_mapper import FixUnicodeMapper -from .generate_instruction_mapper import GenerateInstructionMapper +from .generate_qa_from_examples_mapper import GenerateQAFromExamplesMapper +from .generate_qa_from_text_mapper import GenerateQAFromTextMapper from .image_blur_mapper import ImageBlurMapper from .image_captioning_from_gpt4v_mapper import ImageCaptioningFromGPT4VMapper from .image_captioning_mapper import ImageCaptioningMapper @@ -45,7 +45,7 @@ from .image_tagging_mapper import ImageTaggingMapper from .nlpaug_en_mapper import NlpaugEnMapper from .nlpcda_zh_mapper import NlpcdaZhMapper -from .optimize_instruction_mapper import OptimizeInstructionMapper +from .optimize_query_mapper import OptimizeQueryMapper from .punctuation_normalization_mapper import PunctuationNormalizationMapper from .remove_bibliography_mapper import RemoveBibliographyMapper from .remove_comments_mapper import RemoveCommentsMapper @@ -78,6 +78,8 @@ from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper from .whitespace_normalization_mapper import WhitespaceNormalizationMapper +# yapf: enable + __all__ = [ 'VideoCaptioningFromAudioMapper', 'VideoTaggingFromAudioMapper', @@ -97,7 +99,7 @@ 'VideoFFmpegWrappedMapper', 'ChineseConvertMapper', 'NlpcdaZhMapper', - 'OptimizeInstructionMapper', + 'OptimizeQueryMapper', 'ImageBlurMapper', 'CleanCopyrightMapper', 'RemoveNonChineseCharacterlMapper', @@ -109,12 +111,12 @@ 'VideoTaggingFromFramesMapper', 'RemoveCommentsMapper', 'ExpandMacroMapper', - 'ExtractQAMapper', + 'GenerateQAFromExamplesMapper', + 'GenerateQAFromTextMapper', 'ImageCaptioningMapper', 'RemoveWordsWithIncorrectSubstringsMapper', 'VideoCaptioningFromVideoMapper', 'VideoCaptioningFromSummarizerMapper', - 'GenerateInstructionMapper', 'FixUnicodeMapper', 'NlpaugEnMapper', 'VideoCaptioningFromFramesMapper', @@ -127,5 +129,3 @@ 'VideoFaceBlurMapper', 'ImageTaggingMapper', ] - -# yapf: enable diff --git a/data_juicer/ops/mapper/generate_instruction_mapper.py b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py similarity index 98% rename from data_juicer/ops/mapper/generate_instruction_mapper.py rename to data_juicer/ops/mapper/generate_qa_from_examples_mapper.py index 29ee60c5e5..1a91751f8e 100644 --- a/data_juicer/ops/mapper/generate_instruction_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py @@ -27,14 +27,15 @@ EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n\n{qa_pairs}' QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' -OP_NAME = 'generate_instruction_mapper' +OP_NAME = 
'generate_qa_from_examples_mapper' # TODO: Extend LLM-based OPs into API-based implementation. @UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) -class GenerateInstructionMapper(Mapper): - """Mapper to generate new instruction text data. +class GenerateQAFromExamplesMapper(Mapper): + """ + Mapper to generate question and answer pairs from examples. You should configure an empty dataset in your yaml config file: ``` generated_dataset_config: diff --git a/data_juicer/ops/mapper/extract_qa_mapper.py b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py similarity index 97% rename from data_juicer/ops/mapper/extract_qa_mapper.py rename to data_juicer/ops/mapper/generate_qa_from_text_mapper.py index c4f54b37b3..3c5c74df14 100644 --- a/data_juicer/ops/mapper/extract_qa_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py @@ -10,15 +10,15 @@ torch = LazyLoader('torch', 'torch') vllm = LazyLoader('vllm', 'vllm') -OP_NAME = 'extract_qa_mapper' +OP_NAME = 'generate_qa_from_text_mapper' # TODO: Extend LLM-based OPs into API-based implementation. @UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) -class ExtractQAMapper(Mapper): +class GenerateQAFromTextMapper(Mapper): """ - Mapper to extract question and answer pair from text samples. + Mapper to generate question and answer pairs from text. Recommended model list: [ 'alibaba-pai/pai-llama3-8b-doc2qa', 'alibaba-pai/pai-baichuan2-7b-doc2qa', @@ -117,7 +117,7 @@ def _extract_qa(self, output): pat = re.compile(self.pattern, re.DOTALL) qa_pairs = pat.findall(output) - for _, qa in enumerate(qa_pairs, 1): + for qa in qa_pairs: user, assistant = qa qa_list.append((user.strip(), assistant.strip())) diff --git a/data_juicer/ops/mapper/optimize_instruction_mapper.py b/data_juicer/ops/mapper/optimize_query_mapper.py similarity index 97% rename from data_juicer/ops/mapper/optimize_instruction_mapper.py rename to data_juicer/ops/mapper/optimize_query_mapper.py index 1bf0e93193..9883a5e26b 100644 --- a/data_juicer/ops/mapper/optimize_instruction_mapper.py +++ b/data_juicer/ops/mapper/optimize_query_mapper.py @@ -11,14 +11,14 @@ DEFAULT_SYSTEM_PROMPT = '请优化这个指令,将其修改为一个更详细具体的指令。' -OP_NAME = 'optimize_instruction_mapper' +OP_NAME = 'optimize_query_mapper' # TODO: Extend LLM-based OPs into API-based implementation. @UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) -class OptimizeInstructionMapper(Mapper): - """Mapper to optimize instruction. +class OptimizeQueryMapper(Mapper): + """Mapper to optimize instruction query. 
Recommended model list: [ alibaba-pai/Qwen2-1.5B-Instruct-Refine alibaba-pai/Qwen2-7B-Instruct-Refine diff --git a/data_juicer/utils/auto_install_mapping.py b/data_juicer/utils/auto_install_mapping.py index 2d6db9625e..d8e919cb02 100644 --- a/data_juicer/utils/auto_install_mapping.py +++ b/data_juicer/utils/auto_install_mapping.py @@ -18,10 +18,10 @@ 'image_watermark_filter', 'phrase_grounding_recall_filter', 'video_aesthetics_filter', 'video_frames_text_similarity_filter', 'video_nsfw_filter', 'video_tagging_from_frames_filter', - 'video_watermark_filter', 'extract_qa_mapper', - 'generate_instruction_mapper', 'image_captioning_mapper', + 'video_watermark_filter', 'generate_qa_from_text_mapper', + 'generate_qa_from_examples_mapper', 'image_captioning_mapper', 'image_diffusion_mapper', 'image_tagging_mapper', - 'optimize_instruction_mapper', 'video_captioning_from_frames_mapper', + 'optimize_query_mapper', 'video_captioning_from_frames_mapper', 'video_captioning_from_summarizer_mapper', 'video_captioning_from_video_mapper', 'video_tagging_from_audio_mapper', 'video_tagging_from_frames_mapper' @@ -56,9 +56,9 @@ 'image_watermark_filter', 'phrase_grounding_recall_filter', 'token_num_filter', 'video_aesthetics_filter', 'video_frames_text_similarity_filter', 'video_nsfw_filter', - 'extract_qa_mapper', 'generate_instruction_mapper', + 'generate_qa_from_text_mapper', 'generate_qa_from_examples_mapper', 'image_captioning_mapper', 'image_diffusion_mapper', - 'optimize_instruction_mapper', 'video_captioning_from_audio_mapper', + 'optimize_query_mapper', 'video_captioning_from_audio_mapper', 'video_captioning_from_frames_mapper', 'video_captioning_from_summarizer_mapper', 'video_captioning_from_video_mapper', 'video_tagging_from_audio_mapper' @@ -96,9 +96,9 @@ 'video_remove_watermark_mapper' ], 'vllm': [ - 'extract_qa_mapper', 'generate_instruction_mapper', - 'optimize_instruction_mapper' + 'generate_qa_from_text_mapper', 'generate_qa_from_examples_mapper', + 'optimize_query_mapper' ], - 'rouge': ['generate_instruction_mapper'], + 'rouge': ['generate_qa_from_examples_mapper'], 'ram': ['image_tagging_mapper', 'video_tagging_from_frames_mapper'] } diff --git a/docs/Operators.md b/docs/Operators.md index dd56871c4b..acfd1654f3 100644 --- a/docs/Operators.md +++ b/docs/Operators.md @@ -58,18 +58,18 @@ All the specific operators are listed below, each featured with several capabili | clean_ip_mapper | General | en, zh | Removes IP addresses | | clean_links_mapper | General, Code | en, zh | Removes links, such as those starting with http or ftp | | expand_macro_mapper | LaTeX | en, zh | Expands macros usually defined at the top of TeX documents | -| extract_qa_mapper | General | en, zh | Extract question and answer pair from text samples. | +| generate_qa_from_text_mapper | General | en, zh | Generate question and answer pairs from text samples. | | fix_unicode_mapper | General | en, zh | Fixes broken Unicodes (by [ftfy](https://ftfy.readthedocs.io/)) | -| generate_instruction_mapper | General | en, zh | Generate instruction text samples. | +| generate_qa_from_examples_mapper | General | en, zh | Generate question and answer pairs from examples samples. 
| | image_blur_mapper | Image | - | Blur images | | image_captioning_from_gpt4v_mapper | Multimodal | - | generate samples whose texts are generated based on gpt-4-visison and the image | | image_captioning_mapper | Multimodal | - | generate samples whose captions are generated based on another model (such as blip2) and the figure within the original sample | | image_diffusion_mapper | Multimodal | - | Generate and augment images by stable diffusion model | | image_face_blur_mapper | Image | - | Blur faces detected in images | -| image_tagging_mapper | Multimodal | - | Mapper to generate image tags from the input images. | +| image_tagging_mapper | Multimodal | - | Mapper to generate image tags from the input images. | | nlpaug_en_mapper | General | en | Simply augments texts in English based on the `nlpaug` library | | nlpcda_zh_mapper | General | zh | Simply augments texts in Chinese based on the `nlpcda` library | -| optimize_instruction_mapper | General | en, zh | Optimize instruction text samples. | +| optimize_query_mapper | General | en, zh | Optimize instruction query samples. | | punctuation_normalization_mapper | General | en, zh | Normalizes various Unicode punctuations to their ASCII equivalents | | remove_bibliography_mapper | LaTeX | en, zh | Removes the bibliography of TeX documents | | remove_comments_mapper | LaTeX | en, zh | Removes the comments of TeX documents | diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md index f5d598f540..c548c4208b 100644 --- a/docs/Operators_ZH.md +++ b/docs/Operators_ZH.md @@ -57,18 +57,18 @@ Data-Juicer 中的算子分为以下 5 种类型。 | clean_ip_mapper | General | en, zh | 删除 IP 地址 | | clean_links_mapper | General, Code | en, zh | 删除链接,例如以 http 或 ftp 开头的 | | expand_macro_mapper | LaTeX | en, zh | 扩展通常在 TeX 文档顶部定义的宏 | -| extract_qa_mapper | General | en, zh | 从文本中抽取问答对 | +| generate_qa_from_text_mapper | General | en, zh | 从文本中生成问答对 | | fix_unicode_mapper | General | en, zh | 修复损坏的 Unicode(借助 [ftfy](https://ftfy.readthedocs.io/)) | -| generate_instruction_mapper | General | en, zh | 指令扩充,根据种子数据,生成新的样本。 | +| generate_qa_from_examples_mapper | General | en, zh | 根据种子数据,生成新的样本。 | | image_blur_mapper | Image | - | 对图像进行模糊处理 | | image_captioning_from_gpt4v_mapper | Multimodal | - | 基于gpt-4-vision和图像生成文本 | | image_captioning_mapper | Multimodal | - | 生成样本,其标题是根据另一个辅助模型(例如 blip2)和原始样本中的图形生成的。 | | image_diffusion_mapper | Multimodal | - | 用stable diffusion生成图像,对图像进行增强 | | image_face_blur_mapper | Image | - | 对图像中的人脸进行模糊处理 | -| image_tagging_mapper | Multimodal | - | 从输入图片中生成图片标签 | +| image_tagging_mapper | Multimodal | - | 从输入图片中生成图片标签 | | nlpaug_en_mapper | General | en | 使用`nlpaug`库对英语文本进行简单增强 | | nlpcda_zh_mapper | General | zh | 使用`nlpcda`库对中文文本进行简单增强 | -| optimize_instruction_mapper | General | en, zh | 指令优化,优化prompt。 | +| optimize_query_mapper | General | en, zh | 指令优化,优化 query | | punctuation_normalization_mapper | General | en, zh | 将各种 Unicode 标点符号标准化为其 ASCII 等效项 | | remove_bibliography_mapper | LaTeX | en, zh | 删除 TeX 文档的参考文献 | | remove_comments_mapper | LaTeX | en, zh | 删除 TeX 文档中的注释 | diff --git a/tests/ops/mapper/test_generate_instruction_mapper.py b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py similarity index 87% rename from tests/ops/mapper/test_generate_instruction_mapper.py rename to tests/ops/mapper/test_generate_qa_from_examples_mapper.py index e0fee57c28..90e25a787f 100644 --- a/tests/ops/mapper/test_generate_instruction_mapper.py +++ b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py @@ -1,19 +1,19 @@ import unittest 
import json from loguru import logger -from data_juicer.ops.mapper.generate_instruction_mapper import GenerateInstructionMapper +from data_juicer.ops.mapper.generate_qa_from_examples_mapper import GenerateQAFromExamplesMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) # Skip tests for this OP in the GitHub actions due to disk space limitation. # These tests have been tested locally. @SKIPPED_TESTS.register_module() -class GenerateInstructionMapperTest(DataJuicerTestCaseBase): +class GenerateQAFromExamplesMapperTest(DataJuicerTestCaseBase): text_key = 'text' def _run_generate_instruction(self, enable_vllm=False): - op = GenerateInstructionMapper( + op = GenerateQAFromExamplesMapper( hf_model='Qwen/Qwen-7B-Chat', seed_file='demos/data/demo-dataset-chatml.jsonl', instruct_num=2, diff --git a/tests/ops/mapper/test_extract_qa_mapper.py b/tests/ops/mapper/test_generate_qa_from_text_mapper.py similarity index 90% rename from tests/ops/mapper/test_extract_qa_mapper.py rename to tests/ops/mapper/test_generate_qa_from_text_mapper.py index 384547f40a..e9e822ab23 100644 --- a/tests/ops/mapper/test_extract_qa_mapper.py +++ b/tests/ops/mapper/test_generate_qa_from_text_mapper.py @@ -1,18 +1,18 @@ import unittest from loguru import logger from data_juicer.core.data import NestedDataset as Dataset -from data_juicer.ops.mapper.extract_qa_mapper import ExtractQAMapper +from data_juicer.ops.mapper.generate_qa_from_text_mapper import GenerateQAFromTextMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) # Skip tests for this OP in the GitHub actions due to disk space limitation. # These tests have been tested locally. @SKIPPED_TESTS.register_module() -class ExtractQAMapperTest(DataJuicerTestCaseBase): +class GenerateQAFromTextMapperTest(DataJuicerTestCaseBase): text_key = 'text' def _run_extract_qa(self, samples, enable_vllm=False, sampling_params={}, **kwargs): - op = ExtractQAMapper( + op = GenerateQAFromTextMapper( hf_model='alibaba-pai/pai-qwen1_5-7b-doc2qa', trust_remote_code=True, enable_vllm=enable_vllm, diff --git a/tests/ops/mapper/test_optimize_instruction_mapper.py b/tests/ops/mapper/test_optimize_query_mapper.py similarity index 86% rename from tests/ops/mapper/test_optimize_instruction_mapper.py rename to tests/ops/mapper/test_optimize_query_mapper.py index 94d03c0dff..ff35c6eacf 100644 --- a/tests/ops/mapper/test_optimize_instruction_mapper.py +++ b/tests/ops/mapper/test_optimize_query_mapper.py @@ -1,17 +1,17 @@ import unittest from loguru import logger -from data_juicer.ops.mapper.optimize_instruction_mapper import OptimizeInstructionMapper +from data_juicer.ops.mapper.optimize_query_mapper import OptimizeQueryMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) # Skip tests for this OP in the GitHub actions due to disk space limitation. # These tests have been tested locally. 
@SKIPPED_TESTS.register_module() -class OptimizeInstructionMapperTest(DataJuicerTestCaseBase): +class OptimizeQueryMapperTest(DataJuicerTestCaseBase): query_key = 'query' def _run_optimize_instruction(self, enable_vllm=False): - op = OptimizeInstructionMapper( + op = OptimizeQueryMapper( hf_model='alibaba-pai/Qwen2-7B-Instruct-Refine', enable_vllm=enable_vllm ) From cf521cc9f27245c22b7b5b3ba432854b22a27a21 Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:24:52 +0800 Subject: [PATCH 06/23] sort --- data_juicer/ops/common/__init__.py | 11 ++-- data_juicer/ops/deduplicator/__init__.py | 7 +-- data_juicer/ops/filter/__init__.py | 32 +++++------ data_juicer/ops/mapper/__init__.py | 67 +++++++----------------- 4 files changed, 43 insertions(+), 74 deletions(-) diff --git a/data_juicer/ops/common/__init__.py b/data_juicer/ops/common/__init__.py index 74e8dd33db..1493b9ee59 100644 --- a/data_juicer/ops/common/__init__.py +++ b/data_juicer/ops/common/__init__.py @@ -5,12 +5,7 @@ from .special_characters import SPECIAL_CHARACTERS __all__ = [ - 'get_sentences_from_document', - 'get_words_from_document', - 'merge_on_whitespace_tab_newline', - 'split_on_newline_tab_whitespace', - 'split_on_whitespace', - 'strip', - 'words_augmentation', - 'words_refinement', + 'get_sentences_from_document', 'get_words_from_document', + 'merge_on_whitespace_tab_newline', 'split_on_newline_tab_whitespace', + 'split_on_whitespace', 'strip', 'words_augmentation', 'words_refinement' ] diff --git a/data_juicer/ops/deduplicator/__init__.py b/data_juicer/ops/deduplicator/__init__.py index 69f73b3612..dd549d02ea 100644 --- a/data_juicer/ops/deduplicator/__init__.py +++ b/data_juicer/ops/deduplicator/__init__.py @@ -13,7 +13,8 @@ from .video_deduplicator import VideoDeduplicator __all__ = [ - 'VideoDeduplicator', 'RayBasicDeduplicator', 'DocumentMinhashDeduplicator', - 'RayImageDeduplicator', 'RayDocumentDeduplicator', 'DocumentDeduplicator', - 'ImageDeduplicator', 'DocumentSimhashDeduplicator', 'RayVideoDeduplicator' + 'DocumentDeduplicator', 'DocumentMinhashDeduplicator', + 'DocumentSimhashDeduplicator', 'ImageDeduplicator', 'RayBasicDeduplicator', + 'RayDocumentDeduplicator', 'RayImageDeduplicator', 'RayVideoDeduplicator', + 'VideoDeduplicator' ] diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py index 338870adde..2aabc8d9c2 100644 --- a/data_juicer/ops/filter/__init__.py +++ b/data_juicer/ops/filter/__init__.py @@ -67,20 +67,20 @@ # yapf: enable __all__ = [ - 'ImageTextSimilarityFilter', 'VideoAspectRatioFilter', - 'ImageTextMatchingFilter', 'ImageNSFWFilter', 'TokenNumFilter', - 'TextLengthFilter', 'SpecifiedNumericFieldFilter', 'AudioNMFSNRFilter', - 'VideoAestheticsFilter', 'PerplexityFilter', 'PhraseGroundingRecallFilter', - 'MaximumLineLengthFilter', 'AverageLineLengthFilter', - 'SpecifiedFieldFilter', 'VideoTaggingFromFramesFilter', - 'TextEntityDependencyFilter', 'VideoResolutionFilter', - 'AlphanumericFilter', 'ImageWatermarkFilter', 'ImageAestheticsFilter', - 'AudioSizeFilter', 'StopWordsFilter', 'CharacterRepetitionFilter', - 'ImageShapeFilter', 'VideoDurationFilter', 'TextActionFilter', - 'VideoOcrAreaRatioFilter', 'VideoNSFWFilter', 'SpecialCharactersFilter', - 'VideoFramesTextSimilarityFilter', 'ImageAspectRatioFilter', - 'AudioDurationFilter', 'LanguageIDScoreFilter', 'SuffixFilter', - 'ImageSizeFilter', 'VideoWatermarkFilter', 'WordsNumFilter', - 'ImageFaceRatioFilter', 'FlaggedWordFilter', 'WordRepetitionFilter', - 
'VideoMotionScoreFilter', 'ImagePairSimilarityFilter' + 'AlphanumericFilter', 'AudioDurationFilter', 'AudioNMFSNRFilter', + 'AudioSizeFilter', 'AverageLineLengthFilter', 'CharacterRepetitionFilter', + 'FlaggedWordFilter', 'ImageAestheticsFilter', 'ImageAspectRatioFilter', + 'ImageFaceRatioFilter', 'ImageNSFWFilter', 'ImagePairSimilarityFilter', + 'ImageShapeFilter', 'ImageSizeFilter', 'ImageTextMatchingFilter', + 'ImageTextSimilarityFilter', 'ImageWatermarkFilter', + 'LanguageIDScoreFilter', 'MaximumLineLengthFilter', 'PerplexityFilter', + 'PhraseGroundingRecallFilter', 'SpecialCharactersFilter', + 'SpecifiedFieldFilter', 'SpecifiedNumericFieldFilter', 'StopWordsFilter', + 'SuffixFilter', 'TextActionFilter', 'TextEntityDependencyFilter', + 'TextLengthFilter', 'TokenNumFilter', 'VideoAestheticsFilter', + 'VideoAspectRatioFilter', 'VideoDurationFilter', + 'VideoFramesTextSimilarityFilter', 'VideoMotionScoreFilter', + 'VideoNSFWFilter', 'VideoOcrAreaRatioFilter', 'VideoResolutionFilter', + 'VideoTaggingFromFramesFilter', 'VideoWatermarkFilter', + 'WordRepetitionFilter', 'WordsNumFilter' ] diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py index b61997c11c..8d06c1c914 100644 --- a/data_juicer/ops/mapper/__init__.py +++ b/data_juicer/ops/mapper/__init__.py @@ -81,51 +81,24 @@ # yapf: enable __all__ = [ - 'VideoCaptioningFromAudioMapper', - 'VideoTaggingFromAudioMapper', - 'ImageCaptioningFromGPT4VMapper', - 'PunctuationNormalizationMapper', - 'RemoveBibliographyMapper', - 'SentenceSplitMapper', - 'VideoSplitBySceneMapper', - 'CleanIpMapper', - 'CleanLinksMapper', - 'RemoveHeaderMapper', - 'RemoveTableTextMapper', - 'VideoRemoveWatermarkMapper', - 'RemoveRepeatSentencesMapper', - 'ImageDiffusionMapper', - 'ImageFaceBlurMapper', - 'VideoFFmpegWrappedMapper', - 'ChineseConvertMapper', - 'NlpcdaZhMapper', - 'OptimizeQueryMapper', - 'ImageBlurMapper', - 'CleanCopyrightMapper', - 'RemoveNonChineseCharacterlMapper', - 'VideoSplitByKeyFrameMapper', - 'RemoveSpecificCharsMapper', - 'VideoResizeAspectRatioMapper', - 'CleanHtmlMapper', - 'WhitespaceNormalizationMapper', - 'VideoTaggingFromFramesMapper', - 'RemoveCommentsMapper', - 'ExpandMacroMapper', - 'GenerateQAFromExamplesMapper', - 'GenerateQAFromTextMapper', - 'ImageCaptioningMapper', - 'RemoveWordsWithIncorrectSubstringsMapper', - 'VideoCaptioningFromVideoMapper', - 'VideoCaptioningFromSummarizerMapper', - 'FixUnicodeMapper', - 'NlpaugEnMapper', - 'VideoCaptioningFromFramesMapper', - 'RemoveLongWordsMapper', - 'VideoResizeResolutionMapper', - 'CleanEmailMapper', - 'ReplaceContentMapper', - 'AudioFFmpegWrappedMapper', - 'VideoSplitByDurationMapper', - 'VideoFaceBlurMapper', - 'ImageTaggingMapper', + 'AudioFFmpegWrappedMapper', 'ChineseConvertMapper', 'CleanCopyrightMapper', + 'CleanEmailMapper', 'CleanHtmlMapper', 'CleanIpMapper', 'CleanLinksMapper', + 'ExpandMacroMapper', 'FixUnicodeMapper', 'GenerateQAFromExamplesMapper', + 'GenerateQAFromTextMapper', 'ImageBlurMapper', + 'ImageCaptioningFromGPT4VMapper', 'ImageCaptioningMapper', + 'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageTaggingMapper', + 'NlpaugEnMapper', 'NlpcdaZhMapper', 'OptimizeQueryMapper', + 'PunctuationNormalizationMapper', 'RemoveBibliographyMapper', + 'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper', + 'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper', + 'RemoveSpecificCharsMapper', 'RemoveTableTextMapper', + 'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper', + 'SentenceSplitMapper', 
'VideoCaptioningFromAudioMapper', + 'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper', + 'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper', + 'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper', + 'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper', + 'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper', + 'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper', + 'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper' ] From cc39ef78ae7fa0c3ca5a1eee2de3e577fd710ea0 Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Thu, 24 Oct 2024 15:50:46 +0800 Subject: [PATCH 07/23] add associated ops --- data_juicer/ops/mapper/__init__.py | 8 +- .../ops/mapper/optimize_query_mapper.py | 90 +++++-------------- 2 files changed, 28 insertions(+), 70 deletions(-) diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py index 8d06c1c914..ca5adfe993 100644 --- a/data_juicer/ops/mapper/__init__.py +++ b/data_juicer/ops/mapper/__init__.py @@ -7,7 +7,8 @@ image_captioning_from_gpt4v_mapper, image_captioning_mapper, image_diffusion_mapper, image_face_blur_mapper, image_tagging_mapper, nlpaug_en_mapper, nlpcda_zh_mapper, - optimize_query_mapper, punctuation_normalization_mapper, + optimize_qa_mapper, optimize_query_mapper, + optimize_response_mapper, punctuation_normalization_mapper, remove_bibliography_mapper, remove_comments_mapper, remove_header_mapper, remove_long_words_mapper, remove_non_chinese_character_mapper, @@ -45,7 +46,9 @@ from .image_tagging_mapper import ImageTaggingMapper from .nlpaug_en_mapper import NlpaugEnMapper from .nlpcda_zh_mapper import NlpcdaZhMapper +from .optimize_qa_mapper import OptimizeQAMapper from .optimize_query_mapper import OptimizeQueryMapper +from .optimize_response_mapper import OptimizeResponseMapper from .punctuation_normalization_mapper import PunctuationNormalizationMapper from .remove_bibliography_mapper import RemoveBibliographyMapper from .remove_comments_mapper import RemoveCommentsMapper @@ -87,7 +90,8 @@ 'GenerateQAFromTextMapper', 'ImageBlurMapper', 'ImageCaptioningFromGPT4VMapper', 'ImageCaptioningMapper', 'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageTaggingMapper', - 'NlpaugEnMapper', 'NlpcdaZhMapper', 'OptimizeQueryMapper', + 'NlpaugEnMapper', 'NlpcdaZhMapper', 'OptimizeQAMapper', + 'OptimizeQueryMapper', 'OptimizeResponseMapper', 'PunctuationNormalizationMapper', 'RemoveBibliographyMapper', 'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper', 'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper', diff --git a/data_juicer/ops/mapper/optimize_query_mapper.py b/data_juicer/ops/mapper/optimize_query_mapper.py index 9883a5e26b..255f1eebc6 100644 --- a/data_juicer/ops/mapper/optimize_query_mapper.py +++ b/data_juicer/ops/mapper/optimize_query_mapper.py @@ -1,32 +1,29 @@ from typing import Dict, Optional -from loguru import logger - -from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE +from data_juicer.ops.mapper import OptimizeQAMapper from data_juicer.utils.lazy_loader import LazyLoader -from data_juicer.utils.model_utils import get_model, prepare_model torch = LazyLoader('torch', 'torch') vllm = LazyLoader('vllm', 'vllm') -DEFAULT_SYSTEM_PROMPT = '请优化这个指令,将其修改为一个更详细具体的指令。' - OP_NAME = 'optimize_query_mapper' # TODO: Extend LLM-based OPs into API-based implementation. 
@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) -class OptimizeQueryMapper(Mapper): - """Mapper to optimize instruction query. - Recommended model list: [ - alibaba-pai/Qwen2-1.5B-Instruct-Refine - alibaba-pai/Qwen2-7B-Instruct-Refine - ] +class OptimizeQueryMapper(OptimizeQAMapper): + """ + Mapper to optimize only query in question-answer pairs. """ + + DEFAULT_SYSTEM_PROMPT = '优化问答对中的问题,将其更加详细具体,但仍可以由原答案回答。只输出优化后的问题,不要输出多余内容。' + _accelerator = 'cuda' def __init__(self, + *, hf_model: str = 'alibaba-pai/Qwen2-7B-Instruct-Refine', trust_remote_code: bool = False, system_prompt: Optional[str] = None, @@ -35,7 +32,6 @@ def __init__(self, max_model_len: Optional[int] = None, max_num_seqs: int = 256, sampling_params: Dict = {}, - *args, **kwargs): """ Initialization method. @@ -56,60 +52,18 @@ def __init__(self, :param args: extra args :param kwargs: extra args """ - super().__init__(*args, **kwargs) - self.num_proc = 1 - - if system_prompt is None: - system_prompt = DEFAULT_SYSTEM_PROMPT - self.system_prompt = system_prompt - self.enable_vllm = enable_vllm - - if enable_vllm: - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if not tensor_parallel_size: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to \ - {tensor_parallel_size} for vllm.') - self.model_key = prepare_model( - model_type='vllm', - pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) - self.sampling_params = vllm.SamplingParams(**sampling_params) - else: - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code) - self.sampling_params = sampling_params - - def process_single(self, sample=None, rank=None): - model, processor = get_model(self.model_key, rank=rank) - - messages = [{ - 'role': 'system', - 'content': self.system_prompt - }, { - 'role': 'user', - 'content': sample[self.query_key] - }] - input_prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True) - - if self.enable_vllm: - response = model.generate([input_prompt], self.sampling_params) - output = response[0].outputs[0].text - else: - inputs = processor(input_prompt, - return_tensors='pt').to(model.device) - response = model.generate(**inputs, - eos_token_id=processor.eos_token_id, - **self.sampling_params) - output = processor.decode(response.cpu()[0], - skip_special_tokens=True) + super().__init__(hf_model=hf_model, + trust_remote_code=trust_remote_code, + system_prompt=system_prompt, + enable_vllm=enable_vllm, + tensor_parallel_size=tensor_parallel_size, + max_model_len=max_model_len, + max_num_seqs=max_num_seqs, + sampling_params=sampling_params, + **kwargs) - sample[self.query_key] = output + def build_input(self, sample): + return sample[self.query_key] - return sample + def parse_output(self, raw_output): + return raw_output.strip(), None From f2201e2e89a4ca79b29b35743c79ba2e9eaae25d Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Thu, 24 Oct 2024 16:06:12 +0800 Subject: [PATCH 08/23] add tests --- data_juicer/ops/mapper/optimize_qa_mapper.py | 149 ++++++++++++++++++ .../ops/mapper/optimize_response_mapper.py | 69 ++++++++ tests/ops/mapper/test_optimize_qa_mapper.py | 39 +++++ .../ops/mapper/test_optimize_query_mapper.py | 20 +-- .../mapper/test_optimize_response_mapper.py | 37 +++++ 5 
files changed, 304 insertions(+), 10 deletions(-) create mode 100644 data_juicer/ops/mapper/optimize_qa_mapper.py create mode 100644 data_juicer/ops/mapper/optimize_response_mapper.py create mode 100644 tests/ops/mapper/test_optimize_qa_mapper.py create mode 100644 tests/ops/mapper/test_optimize_response_mapper.py diff --git a/data_juicer/ops/mapper/optimize_qa_mapper.py b/data_juicer/ops/mapper/optimize_qa_mapper.py new file mode 100644 index 0000000000..ccdf6730cb --- /dev/null +++ b/data_juicer/ops/mapper/optimize_qa_mapper.py @@ -0,0 +1,149 @@ +import re +from typing import Dict, Optional + +from loguru import logger + +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper +from data_juicer.utils.lazy_loader import LazyLoader +from data_juicer.utils.model_utils import get_model, prepare_model + +torch = LazyLoader('torch', 'torch') +vllm = LazyLoader('vllm', 'vllm') + +OP_NAME = 'optimize_qa_mapper' + + +# TODO: Extend LLM-based OPs into API-based implementation. +@UNFORKABLE.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class OptimizeQAMapper(Mapper): + """ + Mapper to optimize question-answer pairs. + """ + + # avoid leading whitespace + DEFAULT_SYSTEM_PROMPT = ('请优化输入的问答对,使【问题】和【回答】都更加详细、准确。\n' + '按照以下格式输出:\n' + '【问题】\n' + '优化后的问题\n' + '【回答】\n' + '优化后的回答') + DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n\n{qa_pair}' + DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' + DEFAULT_OUTPUT_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*)' + + _accelerator = 'cuda' + + def __init__(self, + *, + hf_model: str = 'Qwen/Qwen-7B-Chat', + trust_remote_code: bool = False, + system_prompt: Optional[str] = None, + input_template: Optional[str] = None, + qa_pair_template: Optional[str] = None, + output_pattern: Optional[str] = None, + enable_vllm: bool = True, + tensor_parallel_size: Optional[int] = None, + max_model_len: Optional[int] = None, + max_num_seqs: int = 256, + sampling_params: Dict = {}, + **kwargs): + """ + Initialization method. + + :param hf_model: Hugging Face model ID. + :param trust_remote_code: Whether to trust remote code from the model + (passed to transformers). + :param system_prompt: System prompt for the optimization task. + :param input_template: Template for building the input for the model. + :param qa_pair_template: Template for formatting the question-answer + pair. + :param output_pattern: Pattern for parsing the output from the model. + :param enable_vllm: Whether to use VLLM for inference acceleration. + :param tensor_parallel_size: Number of GPUs for distributed execution, + valid only if VLLM is enabled. + :param max_model_len: Model context length, valid only if VLLM is + enabled. If unspecified, will be derived from the model config. + :param max_num_seqs: Max number of sequences to process at once, valid + only if VLLM is enabled. + :param sampling_params: Sampling parameters for text generation (e.g., + {'temperature': 0.9, 'top_p': 0.95}). + :param args: Extra positional arguments. + :param kwargs: Extra keyword arguments. 
+ """ + super().__init__(**kwargs) + self.num_proc = 1 + + self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT + self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE + self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN + self.qa_pair_template = qa_pair_template or \ + self.DEFAULT_QA_PAIR_TEMPLATE + self.enable_vllm = enable_vllm + + if enable_vllm: + assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' + if not tensor_parallel_size: + tensor_parallel_size = torch.cuda.device_count() + logger.info(f'Set tensor_parallel_size to \ + {tensor_parallel_size} for vllm.') + self.model_key = prepare_model( + model_type='vllm', + pretrained_model_name_or_path=hf_model, + trust_remote_code=trust_remote_code, + tensor_parallel_size=tensor_parallel_size, + max_model_len=max_model_len, + max_num_seqs=max_num_seqs) + self.sampling_params = vllm.SamplingParams(**sampling_params) + else: + self.model_key = prepare_model( + model_type='huggingface', + pretrained_model_name_or_path=hf_model, + trust_remote_code=trust_remote_code) + self.sampling_params = sampling_params + + def build_input(self, sample): + qa_pair = self.qa_pair_pattern.format(sample[self.query_key], + sample[self.response_key]) + input_prompt = self.input_pattern.format(qa_pair) + return input_prompt + + def parse_output(self, raw_output): + match = re.match(self.output_pattern, raw_output) + if match: + return match.group(1).strip(), match.group(2).strip() + else: + return None, None + + def process_single(self, sample=None, rank=None): + model, processor = get_model(self.model_key, rank=rank) + + messages = [{ + 'role': 'system', + 'content': self.system_prompt + }, { + 'role': 'user', + 'content': self.build_input(sample) + }] + input_prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) + + if self.enable_vllm: + response = model.generate([input_prompt], self.sampling_params) + output = response[0].outputs[0].text + else: + inputs = processor(input_prompt, + return_tensors='pt').to(model.device) + response = model.generate(**inputs, + eos_token_id=processor.eos_token_id, + **self.sampling_params) + output = processor.decode(response.cpu()[0], + skip_special_tokens=True) + + parsed_q, parsed_a = self.parse_output(output) + if parsed_q: + sample[self.query_key] = parsed_q + if parsed_a: + sample[self.response_key] = parsed_a + + return sample diff --git a/data_juicer/ops/mapper/optimize_response_mapper.py b/data_juicer/ops/mapper/optimize_response_mapper.py new file mode 100644 index 0000000000..cf4b067167 --- /dev/null +++ b/data_juicer/ops/mapper/optimize_response_mapper.py @@ -0,0 +1,69 @@ +from typing import Dict, Optional + +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE +from data_juicer.ops.mapper import OptimizeQAMapper +from data_juicer.utils.lazy_loader import LazyLoader + +torch = LazyLoader('torch', 'torch') +vllm = LazyLoader('vllm', 'vllm') + +OP_NAME = 'optimize_response_mapper' + + +# TODO: Extend LLM-based OPs into API-based implementation. +@UNFORKABLE.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class OptimizeResponseMapper(OptimizeQAMapper): + """ + Mapper to optimize only response in question-answer pairs. 
+ """ + + DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。' + + _accelerator = 'cuda' + + def __init__(self, + *, + hf_model: str = 'Qwen/Qwen-7B-Chat', + trust_remote_code: bool = False, + system_prompt: Optional[str] = None, + enable_vllm: bool = True, + tensor_parallel_size: Optional[int] = None, + max_model_len: Optional[int] = None, + max_num_seqs: int = 256, + sampling_params: Dict = {}, + **kwargs): + """ + Initialization method. + :param hf_model: Hugginface model id. + :param trust_remote_code: passed to transformers + :param system_prompt: System prompt for optimize samples. + :param enable_vllm: Whether to use vllm for inference acceleration. + :param tensor_parallel_size: It is only valid when enable_vllm is True. + The number of GPUs to use for distributed execution with tensor + parallelism. + :param max_model_len: It is only valid when enable_vllm is True. + Model context length. If unspecified, will be automatically + derived from the model config. + :param max_num_seqs: It is only valid when enable_vllm is True. + Maximum number of sequences to be processed in a single iteration. + :param sampling_params: Sampling parameters for text generation. + e.g {'temperature': 0.9, 'top_p': 0.95} + :param args: extra args + :param kwargs: extra args + """ + super().__init__(hf_model=hf_model, + trust_remote_code=trust_remote_code, + system_prompt=system_prompt, + enable_vllm=enable_vllm, + tensor_parallel_size=tensor_parallel_size, + max_model_len=max_model_len, + max_num_seqs=max_num_seqs, + sampling_params=sampling_params, + **kwargs) + + def build_input(self, sample): + return sample[self.response_key] + + def parse_output(self, raw_output): + return None, raw_output.strip() diff --git a/tests/ops/mapper/test_optimize_qa_mapper.py b/tests/ops/mapper/test_optimize_qa_mapper.py new file mode 100644 index 0000000000..9933bbaee6 --- /dev/null +++ b/tests/ops/mapper/test_optimize_qa_mapper.py @@ -0,0 +1,39 @@ +import unittest +from loguru import logger +from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper +from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, + DataJuicerTestCaseBase) + +# Skip tests for this OP in the GitHub actions due to disk space limitation. +# These tests have been tested locally. +@SKIPPED_TESTS.register_module() +class OptimizeQAMapperTest(DataJuicerTestCaseBase): + query_key = 'query' + + def _run_op(self, enable_vllm=False): + op = OptimizeQAMapper( + enable_vllm=enable_vllm + ) + + samples = [{ + 'query': '鱼香肉丝怎么做?', + 'response': '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' + }] + + for sample in samples: + result = op.process(sample) + logger.info(f'Output results: {result}') + # Note: If switching models causes this assert to fail, it may not be a code issue; + # the model might just have limited capabilities. + self.assertNotEqual(result['query'], '') + self.assertNotEqual(result['response'], '') + + def test(self): + self._run_op() + + def test_vllm(self): + self._run_op(enable_vllm=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/ops/mapper/test_optimize_query_mapper.py b/tests/ops/mapper/test_optimize_query_mapper.py index ff35c6eacf..4fff9a3662 100644 --- a/tests/ops/mapper/test_optimize_query_mapper.py +++ b/tests/ops/mapper/test_optimize_query_mapper.py @@ -8,30 +8,30 @@ # These tests have been tested locally. 
@SKIPPED_TESTS.register_module() class OptimizeQueryMapperTest(DataJuicerTestCaseBase): - query_key = 'query' - def _run_optimize_instruction(self, enable_vllm=False): + def _run_op(self, enable_vllm=False): op = OptimizeQueryMapper( hf_model='alibaba-pai/Qwen2-7B-Instruct-Refine', enable_vllm=enable_vllm ) - samples = [ - {self.query_key: '鱼香肉丝怎么做?'} - ] + samples = [{ + 'query': '鱼香肉丝怎么做?', + 'response': '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' + }] for sample in samples: result = op.process(sample) logger.info(f'Output results: {result}') # Note: If switching models causes this assert to fail, it may not be a code issue; # the model might just have limited capabilities. - self.assertNotEqual(sample[op.query_key], '') + self.assertNotEqual(result['query'], '') - def test_optimize_instruction(self): - self._run_optimize_instruction() + def test(self): + self._run_op() - def test_optimize_instruction_vllm(self): - self._run_optimize_instruction(enable_vllm=True) + def test_vllm(self): + self._run_op(enable_vllm=True) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_optimize_response_mapper.py b/tests/ops/mapper/test_optimize_response_mapper.py new file mode 100644 index 0000000000..6f1a0fe9f6 --- /dev/null +++ b/tests/ops/mapper/test_optimize_response_mapper.py @@ -0,0 +1,37 @@ +import unittest +from loguru import logger +from data_juicer.ops.mapper.optimize_response_mapper import OptimizeResponseMapper +from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, + DataJuicerTestCaseBase) + +# Skip tests for this OP in the GitHub actions due to disk space limitation. +# These tests have been tested locally. +@SKIPPED_TESTS.register_module() +class OptimizeResponseMapperTest(DataJuicerTestCaseBase): + + def _run_op(self, enable_vllm=False): + op = OptimizeResponseMapper( + enable_vllm=enable_vllm + ) + + samples = [{ + 'query': '鱼香肉丝怎么做?', + 'response': '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' + }] + + for sample in samples: + result = op.process(sample) + logger.info(f'Output results: {result}') + # Note: If switching models causes this assert to fail, it may not be a code issue; + # the model might just have limited capabilities. 
+ self.assertNotEqual(result['response'], '') + + def test(self): + self._run_op() + + def test_vllm(self): + self._run_op(enable_vllm=True) + + +if __name__ == '__main__': + unittest.main() From 258cad2062aa1acdf63843baeaf50be037fea8fe Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Thu, 24 Oct 2024 16:09:22 +0800 Subject: [PATCH 09/23] fix install mapping --- data_juicer/utils/auto_install_mapping.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/data_juicer/utils/auto_install_mapping.py b/data_juicer/utils/auto_install_mapping.py index d8e919cb02..94d9843e3d 100644 --- a/data_juicer/utils/auto_install_mapping.py +++ b/data_juicer/utils/auto_install_mapping.py @@ -21,7 +21,8 @@ 'video_watermark_filter', 'generate_qa_from_text_mapper', 'generate_qa_from_examples_mapper', 'image_captioning_mapper', 'image_diffusion_mapper', 'image_tagging_mapper', - 'optimize_query_mapper', 'video_captioning_from_frames_mapper', + 'optimize_query_mapper', 'optimize_response_mapper', + 'optimize_qa_mapper', 'video_captioning_from_frames_mapper', 'video_captioning_from_summarizer_mapper', 'video_captioning_from_video_mapper', 'video_tagging_from_audio_mapper', 'video_tagging_from_frames_mapper' @@ -58,7 +59,8 @@ 'video_frames_text_similarity_filter', 'video_nsfw_filter', 'generate_qa_from_text_mapper', 'generate_qa_from_examples_mapper', 'image_captioning_mapper', 'image_diffusion_mapper', - 'optimize_query_mapper', 'video_captioning_from_audio_mapper', + 'optimize_query_mapper', 'optimize_response_mapper', + 'optimize_qa_mapper', 'video_captioning_from_audio_mapper', 'video_captioning_from_frames_mapper', 'video_captioning_from_summarizer_mapper', 'video_captioning_from_video_mapper', 'video_tagging_from_audio_mapper' @@ -96,8 +98,11 @@ 'video_remove_watermark_mapper' ], 'vllm': [ - 'generate_qa_from_text_mapper', 'generate_qa_from_examples_mapper', - 'optimize_query_mapper' + 'generate_qa_from_text_mapper', + 'generate_qa_from_examples_mapper', + 'optimize_query_mapper', + 'optimize_response_mapper', + 'optimize_qa_mapper', ], 'rouge': ['generate_qa_from_examples_mapper'], 'ram': ['image_tagging_mapper', 'video_tagging_from_frames_mapper'] From 9c76b11959a28a5069136b63f82dbc8bfa58aab5 Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Thu, 24 Oct 2024 16:34:59 +0800 Subject: [PATCH 10/23] fix subclasses --- configs/config_all.yaml | 6 +-- .../generate_qa_from_examples_mapper.py | 2 +- data_juicer/ops/mapper/optimize_qa_mapper.py | 8 ++-- .../ops/mapper/optimize_query_mapper.py | 45 ------------------- .../ops/mapper/optimize_response_mapper.py | 45 ------------------- 5 files changed, 8 insertions(+), 98 deletions(-) diff --git a/configs/config_all.yaml b/configs/config_all.yaml index 9554e6f706..8d772814f1 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -64,7 +64,7 @@ process: - generate_qa_from_text_mapper: # mapper to generate question and answer pairs from text. hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa' # model name on huggingface to extract question and answer pair. pattern: null # regular expression pattern to search for within text. - enable_vllm: true # Whether to use vllm for inference acceleration. + enable_vllm: false # Whether to use vllm for inference acceleration. tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism. 
max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config. max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration. @@ -79,7 +79,7 @@ process: qa_pair_template: null # Prompt template for generate question and answer pair description. Please make sure the template contains two "{}" to format question and answer. Default: '【问题】\n{}\n【回答】\n{}\n'. example_template: null # Prompt template for generate examples. Please make sure the template contains "{qa_pairs}", which corresponds to the question and answer pair description generated by param `qa_pair_template`. qa_extraction_pattern: null # Regular expression pattern for parsing question and answer from model response. - enable_vllm: true # Whether to use vllm for inference acceleration. + enable_vllm: false # Whether to use vllm for inference acceleration. tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism. max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config. max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration. @@ -147,7 +147,7 @@ process: replace_equivalent_num: false # whether to open the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法" - optimize_query_mapper: # optimize instruction query. hf_model: 'alibaba-pai/Qwen2-7B-Instruct-Refine' # model name on huggingface to optimize instruction - enable_vllm: true # whether to use vllm for inference acceleration. + enable_vllm: false # whether to use vllm for inference acceleration. tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism. max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config. max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration. 
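With `enable_vllm` now defaulting to false in these entries, a recipe that still wants vLLM acceleration has to opt in per operator. A minimal sketch of such an entry, using only the keys documented above (the concrete values are illustrative assumptions, not part of this change):

  - optimize_query_mapper:                               # optimize instruction query.
      hf_model: 'alibaba-pai/Qwen2-7B-Instruct-Refine'   # model name on huggingface to optimize instruction.
      enable_vllm: true                                  # opt back into vllm for inference acceleration (requires CUDA).
      tensor_parallel_size: 2                            # illustrative value; number of GPUs for tensor parallelism.
      max_model_len: null                                # derive context length from the model config.
      max_num_seqs: 256                                  # maximum number of sequences processed in a single iteration.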
diff --git a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py index 1a91751f8e..5ca5cbfb67 100644 --- a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py @@ -49,7 +49,7 @@ class GenerateQAFromExamplesMapper(Mapper): _accelerator = 'cuda' def __init__(self, - hf_model: str = 'Qwen/Qwen-7B-Chat', + hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', seed_file: str = '', instruct_num: PositiveInt = 3, trust_remote_code: bool = False, diff --git a/data_juicer/ops/mapper/optimize_qa_mapper.py b/data_juicer/ops/mapper/optimize_qa_mapper.py index ccdf6730cb..a79fcfde2a 100644 --- a/data_juicer/ops/mapper/optimize_qa_mapper.py +++ b/data_juicer/ops/mapper/optimize_qa_mapper.py @@ -36,7 +36,7 @@ class OptimizeQAMapper(Mapper): def __init__(self, *, - hf_model: str = 'Qwen/Qwen-7B-Chat', + hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', trust_remote_code: bool = False, system_prompt: Optional[str] = None, input_template: Optional[str] = None, @@ -103,9 +103,9 @@ def __init__(self, self.sampling_params = sampling_params def build_input(self, sample): - qa_pair = self.qa_pair_pattern.format(sample[self.query_key], - sample[self.response_key]) - input_prompt = self.input_pattern.format(qa_pair) + qa_pair = self.qa_pair_template.format(sample[self.query_key], + sample[self.response_key]) + input_prompt = self.input_template.format(qa_pair) return input_prompt def parse_output(self, raw_output): diff --git a/data_juicer/ops/mapper/optimize_query_mapper.py b/data_juicer/ops/mapper/optimize_query_mapper.py index 255f1eebc6..933564e51d 100644 --- a/data_juicer/ops/mapper/optimize_query_mapper.py +++ b/data_juicer/ops/mapper/optimize_query_mapper.py @@ -1,5 +1,3 @@ -from typing import Dict, Optional - from data_juicer.ops.base_op import OPERATORS, UNFORKABLE from data_juicer.ops.mapper import OptimizeQAMapper from data_juicer.utils.lazy_loader import LazyLoader @@ -22,48 +20,5 @@ class OptimizeQueryMapper(OptimizeQAMapper): _accelerator = 'cuda' - def __init__(self, - *, - hf_model: str = 'alibaba-pai/Qwen2-7B-Instruct-Refine', - trust_remote_code: bool = False, - system_prompt: Optional[str] = None, - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, - **kwargs): - """ - Initialization method. - :param hf_model: Hugginface model id. - :param trust_remote_code: passed to transformers - :param system_prompt: System prompt for optimize samples. - :param enable_vllm: Whether to use vllm for inference acceleration. - :param tensor_parallel_size: It is only valid when enable_vllm is True. - The number of GPUs to use for distributed execution with tensor - parallelism. - :param max_model_len: It is only valid when enable_vllm is True. - Model context length. If unspecified, will be automatically - derived from the model config. - :param max_num_seqs: It is only valid when enable_vllm is True. - Maximum number of sequences to be processed in a single iteration. - :param sampling_params: Sampling parameters for text generation. 
- e.g {'temperature': 0.9, 'top_p': 0.95} - :param args: extra args - :param kwargs: extra args - """ - super().__init__(hf_model=hf_model, - trust_remote_code=trust_remote_code, - system_prompt=system_prompt, - enable_vllm=enable_vllm, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs, - sampling_params=sampling_params, - **kwargs) - - def build_input(self, sample): - return sample[self.query_key] - def parse_output(self, raw_output): return raw_output.strip(), None diff --git a/data_juicer/ops/mapper/optimize_response_mapper.py b/data_juicer/ops/mapper/optimize_response_mapper.py index cf4b067167..3e19196eef 100644 --- a/data_juicer/ops/mapper/optimize_response_mapper.py +++ b/data_juicer/ops/mapper/optimize_response_mapper.py @@ -1,5 +1,3 @@ -from typing import Dict, Optional - from data_juicer.ops.base_op import OPERATORS, UNFORKABLE from data_juicer.ops.mapper import OptimizeQAMapper from data_juicer.utils.lazy_loader import LazyLoader @@ -22,48 +20,5 @@ class OptimizeResponseMapper(OptimizeQAMapper): _accelerator = 'cuda' - def __init__(self, - *, - hf_model: str = 'Qwen/Qwen-7B-Chat', - trust_remote_code: bool = False, - system_prompt: Optional[str] = None, - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, - **kwargs): - """ - Initialization method. - :param hf_model: Hugginface model id. - :param trust_remote_code: passed to transformers - :param system_prompt: System prompt for optimize samples. - :param enable_vllm: Whether to use vllm for inference acceleration. - :param tensor_parallel_size: It is only valid when enable_vllm is True. - The number of GPUs to use for distributed execution with tensor - parallelism. - :param max_model_len: It is only valid when enable_vllm is True. - Model context length. If unspecified, will be automatically - derived from the model config. - :param max_num_seqs: It is only valid when enable_vllm is True. - Maximum number of sequences to be processed in a single iteration. - :param sampling_params: Sampling parameters for text generation. 
- e.g {'temperature': 0.9, 'top_p': 0.95} - :param args: extra args - :param kwargs: extra args - """ - super().__init__(hf_model=hf_model, - trust_remote_code=trust_remote_code, - system_prompt=system_prompt, - enable_vllm=enable_vllm, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs, - sampling_params=sampling_params, - **kwargs) - - def build_input(self, sample): - return sample[self.response_key] - def parse_output(self, raw_output): return None, raw_output.strip() From 578973ede5806d8ce48159592cf88b55f9e693bf Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Thu, 24 Oct 2024 16:51:24 +0800 Subject: [PATCH 11/23] fix import --- data_juicer/ops/mapper/optimize_query_mapper.py | 6 +----- data_juicer/ops/mapper/optimize_response_mapper.py | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/data_juicer/ops/mapper/optimize_query_mapper.py b/data_juicer/ops/mapper/optimize_query_mapper.py index 933564e51d..a12c505755 100644 --- a/data_juicer/ops/mapper/optimize_query_mapper.py +++ b/data_juicer/ops/mapper/optimize_query_mapper.py @@ -1,9 +1,5 @@ from data_juicer.ops.base_op import OPERATORS, UNFORKABLE -from data_juicer.ops.mapper import OptimizeQAMapper -from data_juicer.utils.lazy_loader import LazyLoader - -torch = LazyLoader('torch', 'torch') -vllm = LazyLoader('vllm', 'vllm') +from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper OP_NAME = 'optimize_query_mapper' diff --git a/data_juicer/ops/mapper/optimize_response_mapper.py b/data_juicer/ops/mapper/optimize_response_mapper.py index 3e19196eef..cb68cc4c42 100644 --- a/data_juicer/ops/mapper/optimize_response_mapper.py +++ b/data_juicer/ops/mapper/optimize_response_mapper.py @@ -1,9 +1,5 @@ from data_juicer.ops.base_op import OPERATORS, UNFORKABLE -from data_juicer.ops.mapper import OptimizeQAMapper -from data_juicer.utils.lazy_loader import LazyLoader - -torch = LazyLoader('torch', 'torch') -vllm = LazyLoader('vllm', 'vllm') +from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper OP_NAME = 'optimize_response_mapper' From c81d4f7ee08197c3e1f3bb92d9ebed7a36fd78cc Mon Sep 17 00:00:00 2001 From: null <3213204+drcege@users.noreply.github.com> Date: Thu, 24 Oct 2024 17:32:24 +0800 Subject: [PATCH 12/23] fix format --- data_juicer/ops/mapper/optimize_qa_mapper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data_juicer/ops/mapper/optimize_qa_mapper.py b/data_juicer/ops/mapper/optimize_qa_mapper.py index a79fcfde2a..cf09865bc7 100644 --- a/data_juicer/ops/mapper/optimize_qa_mapper.py +++ b/data_juicer/ops/mapper/optimize_qa_mapper.py @@ -28,8 +28,8 @@ class OptimizeQAMapper(Mapper): '优化后的问题\n' '【回答】\n' '优化后的回答') - DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n\n{qa_pair}' - DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' + DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{qa_pair}' + DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}' DEFAULT_OUTPUT_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*)' _accelerator = 'cuda' @@ -105,7 +105,7 @@ def __init__(self, def build_input(self, sample): qa_pair = self.qa_pair_template.format(sample[self.query_key], sample[self.response_key]) - input_prompt = self.input_template.format(qa_pair) + input_prompt = self.input_template.format(qa_pair=qa_pair) return input_prompt def parse_output(self, raw_output): From 8996389cb285fa7168de410e85e43e1db70fd118 Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Mon, 28 Oct 2024 13:16:52 +0000 Subject: [PATCH 13/23] unify methods 
naming --- .../generate_qa_from_examples_mapper.py | 276 ++++++++---------- .../mapper/generate_qa_from_text_mapper.py | 108 +++---- data_juicer/ops/mapper/optimize_qa_mapper.py | 75 ++--- 3 files changed, 203 insertions(+), 256 deletions(-) diff --git a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py index 5ca5cbfb67..4ba27523e0 100644 --- a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py @@ -15,18 +15,6 @@ vllm = LazyLoader('vllm', 'vllm') rouge = LazyLoader('rouge', 'rouge') -DEFAULT_PROMPT_TEMPLATE = """ -请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。注意,新生成的【问题】和【回答】需要满足如下要求: -1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。 -2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。 -3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。 -4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。 -{augmented_data} -""" -QA_EXTRACTION_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*?)\s*(?=【问题】|$)' -EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n\n{qa_pairs}' -QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' - OP_NAME = 'generate_qa_from_examples_mapper' @@ -46,67 +34,66 @@ class GenerateQAFromExamplesMapper(Mapper): The number of samples generated is determined by the length of the empty dataset. """ + + DEFAULT_SYSTEM_PROMPT = ( + '请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。' + '注意,新生成的【问题】和【回答】需要满足如下要求:\n' + '1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。\n' + '2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。\n' + '3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。\n' + '4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。\n') + + DEFAULT_INPUT_TEMPLATE = '{examples}' + DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{qa_pairs}' + DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' + DEFAULT_OUTPUT_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*?)\s*(?=【问题】|$)' + _accelerator = 'cuda' def __init__(self, hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', + *, seed_file: str = '', - instruct_num: PositiveInt = 3, - trust_remote_code: bool = False, + example_num: PositiveInt = 3, similarity_threshold: float = 0.7, - prompt_template: Optional[str] = None, - qa_pair_template: Optional[str] = None, + system_prompt: Optional[str] = None, + input_template: Optional[str] = None, example_template: Optional[str] = None, - qa_extraction_pattern: Optional[str] = None, - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, - *args, + qa_pair_template: Optional[str] = None, + output_pattern: Optional[str] = None, + enable_vllm: bool = False, + llm_params: Optional[Dict] = None, + sampling_params: Optional[Dict] = None, **kwargs): """ Initialization method. :param hf_model: Hugginface model id. :param seed_file: Seed file path, chatml format. - :param instruct_num: The number of instruction samples. - Randomly select N samples from "seed_file" and + :param example_num: The number of QA example. + Randomly select N examples from "seed_file" and put them into prompt as instruction samples. - :param trust_remote_code: passed to transformers :param similarity_threshold: The similarity score threshold between the generated samples and the seed samples. Range from 0 to 1. Samples with similarity score less than this threshold will be kept. - :param prompt_template: Prompt template for generate samples. - Please make sure the template contains "{augmented_data}", - which corresponds to the augmented samples. 
- :param qa_pair_template: Prompt template for generate question - and answer pair description. Please make sure the template - contains two "{}" to format question and answer. - Default: '【问题】\n{}\n【回答】\n{}\n'. - :param example_template: Prompt template for generate examples. - Please make sure the template contains "{qa_pairs}", which - corresponds to the question and answer pair description - generated by param `qa_pair_template`. - Default: '\n如下是一条示例数据:\n\n{qa_pairs}' - :param qa_extraction_pattern: Regular expression pattern for parsing + :param system_prompt: System prompt for the generation task. + :param input_template: Template for building the input for the model. + Please make sure the template contains "{examples}", which + will be filled by `example_num` formatted `example_template`. + :param example_template: Template for formatting each QA example. + :param qa_pair_template: Template for formatting one-round QA pair + within each QA example. Please make sure the template contains + two "{}" to format question and answer. + :param output_pattern: Regular expression pattern for parsing question and answer from model response. :param enable_vllm: Whether to use vllm for inference acceleration. - :param tensor_parallel_size: It is only valid when enable_vllm is True. - The number of GPUs to use for distributed execution with tensor - parallelism. - :param max_model_len: It is only valid when enable_vllm is True. - Model context length. If unspecified, will be automatically - derived from the model config. - :param max_num_seqs: It is only valid when enable_vllm is True. - Maximum number of sequences to be processed in a single iteration. + :param llm_params: LLM initialization parameters. :param sampling_params: Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} - :param args: extra args - :param kwargs: extra args + :param kwargs: Extra keyword arguments. """ - super().__init__(*args, **kwargs) + super().__init__(**kwargs) self.num_proc = 1 if not seed_file: @@ -114,93 +101,76 @@ def __init__(self, 'Please provide `seed_file` in chatml format.' 
'Example: data-juicer/demos/data/demo-dataset-chatml.jsonl') - self.instruct_num = instruct_num + self.seed_file = seed_file + self.example_num = example_num self.similarity_threshold = similarity_threshold self.similarity_type = 'rouge_l' - if prompt_template is None: - prompt_template = DEFAULT_PROMPT_TEMPLATE - if qa_pair_template is None: - qa_pair_template = QA_PAIR_TEMPLATE - if example_template is None: - example_template = EXAMPLE_TEMPLATE - if qa_extraction_pattern is None: - qa_extraction_pattern = QA_EXTRACTION_PATTERN - - self.prompt_template = prompt_template - self.qa_pair_template = qa_pair_template - self.example_template = example_template - self.qa_extraction_pattern = qa_extraction_pattern + self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT + self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE + self.example_template = example_template or self.DEFAULT_EXAMPLE_TEMPLATE # noqa: E501 + self.qa_pair_template = qa_pair_template or \ + self.DEFAULT_QA_PAIR_TEMPLATE + self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN self.enable_vllm = enable_vllm + llm_params = llm_params or {} + sampling_params = sampling_params or {} if enable_vllm: - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if not tensor_parallel_size: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to \ - {tensor_parallel_size} for vllm.') + if llm_params.get('tensor_parallel_size', 1) > 1: + self.num_proc = 1 self.model_key = prepare_model( model_type='vllm', pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) + **llm_params) self.sampling_params = vllm.SamplingParams(**sampling_params) else: self.model_key = prepare_model( model_type='huggingface', pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code) + return_pipe=True, + **llm_params) self.sampling_params = sampling_params - self.seed_qa_samples = self.load_seed_qa_samples(seed_file) - + self.seed_qa_samples = self._load_seed_qa_samples() if len(self.seed_qa_samples) == 0: raise ValueError('No QA data was parsed from the seed file!') - - self.reference_samples = [ - '\n'.join(['\n'.join(qa_pair) for qa_pair in qa_pairs]) + '\n' - for qa_pairs in self.seed_qa_samples + self.seed_qa_str = [ + self._sample_to_str(sample) for sample in self.seed_qa_samples ] - def load_seed_qa_samples(self, seed_file): + def _load_seed_qa_samples(self): """Load QA pairs from chatml format file.""" qa_samples = [] - with open(seed_file) as f: + with open(self.seed_file, encoding='utf-8') as f: lines = f.readlines() for line in lines: line = line.strip() - qa_pairs = self.parse_chatml_str(line) + qa_pairs = self._parse_chatml_str(line) if len(qa_pairs) > 0: qa_samples.append(qa_pairs) - return qa_samples - def build_prompt(self, qa_samples, prompt_template): + def _sample_to_str(self, qa_sample): + return '\n'.join(['\n'.join(qa_pair) for qa_pair in qa_sample]) + '\n' - def format_qa_pairs(qa_pairs): - return ''.join([ - self.qa_pair_template.format(q, a) for q, a in qa_pairs - if q and a - ]) - - body_fragments = [ - self.example_template.format(qa_pairs=format_qa_pairs(qa_pairs)) - for qa_pairs in qa_samples - ] - - body = ''.join(body_fragments) - - return prompt_template.format(augmented_data=body) + def _max_rouge_l_score(self, hypothesis): + r = rouge.Rouge() + max_score = 0.0 + for reference in self.seed_qa_str: 
+ scores = r.get_scores(hypothesis, reference) + rouge_l_score = scores[0]['rouge-l']['f'] + if rouge_l_score > max_score: + max_score = rouge_l_score + return max_score - def parse_chatml_str(self, input_str): + def _parse_chatml_str(self, sample_str): user_input = None assistant_output = None qa_pairs = [] - data = json.loads(input_str) + data = json.loads(sample_str) for message in data['messages']: role = message['role'] content = message['content'] @@ -211,80 +181,86 @@ def parse_chatml_str(self, input_str): qa_pairs.append((user_input, assistant_output)) return qa_pairs - def parse_response(self, response_str): - logger.debug(response_str) - pattern = self.qa_extraction_pattern - matches = re.findall(pattern, response_str, re.DOTALL) - response_str = '' - out_qa_pairs = [] + def build_input(self, qa_examples): + + def format_qa_pairs(qa_example): + return ''.join([ + self.qa_pair_template.format(q, a) for q, a in qa_example + if q and a + ]) + + formatted_examples = ''.join([ + self.example_template.format(qa_pairs=format_qa_pairs(qa_example)) + for qa_example in qa_examples + ]) + input_prompt = self.input_template.format(examples=formatted_examples) + return input_prompt + + def parse_output(self, raw_output): + logger.debug(raw_output) + matches = re.findall(self.output_pattern, raw_output, re.DOTALL) + output_qa_pairs = [] for match in matches: question, answer = match question = question.strip() answer = answer.strip() - out_qa_pairs.append((question, answer)) - response_str += question + '\n' + answer + '\n' - - if len(out_qa_pairs) == 0: - logger.error('Parse model response error! ' - 'No data generated for the current response!') - - return out_qa_pairs, response_str - - def max_rouge_l_score(self, reference, candidates): - - r = rouge.Rouge() - max_score = 0.0 - for candidate in candidates: - scores = r.get_scores(candidate, reference) - rouge_l_score = scores[0]['rouge-l']['f'] - if rouge_l_score > max_score: - max_score = rouge_l_score - return max_score + output_qa_pairs.append((question, answer)) + return output_qa_pairs def process_single(self, sample=None, rank=None): - model, processor = get_model(self.model_key, rank=rank) + model, processor = get_model(self.model_key, rank, self.use_cuda()) random_qa_samples = random.sample(self.seed_qa_samples, - self.instruct_num) - input_prompt = self.build_prompt(random_qa_samples, - self.prompt_template) + self.example_num) + input_prompt = self.build_input(random_qa_samples) + + messages = [{ + 'role': 'system', + 'content': self.system_prompt + }, { + 'role': 'user', + 'content': input_prompt + }] + if self.enable_vllm: - response = model.generate([input_prompt], self.sampling_params) - response_str = response[0].outputs[0].text + response = model.chat(messages, self.sampling_params) + output = response[0].outputs[0].text else: - inputs = processor(input_prompt, - return_tensors='pt').to(model.device) - output_ids = model.generate(**inputs, **self.sampling_params) - # remove the input prompt from the output - output_ids = output_ids[:, inputs.data['input_ids'].shape[1]:] - response_str = processor.decode(output_ids.cpu()[0], - skip_special_tokens=True) - out_qa_pairs, response_str = self.parse_response(response_str) - - if not response_str: - return { + # model is pipe + response = model(messages, + return_full_text=False, + **self.sampling_params) + output = response[0]['generated_text'] + + output_qa_pairs = self.parse_output(output) + if len(output_qa_pairs) == 0: + logger.warning('Parse model response error! 
' + 'No data generated for the current response!') + sample.update({ self.query_key: '', self.response_key: '', self.history_key: [] - } + }) + return sample if self.similarity_type == 'rouge_l': - sim_score = self.max_rouge_l_score(response_str, - self.reference_samples) + output_qa_str = self._sample_to_str(output_qa_pairs) + sim_score = self._max_rouge_l_score(output_qa_str) else: raise ValueError( f'Not support similarity type "{self.similarity_type}"!') if sim_score <= self.similarity_threshold: - query, response = out_qa_pairs[-1] - history = out_qa_pairs[:-1] + query, response = output_qa_pairs[-1] + history = output_qa_pairs[:-1] else: query = response = '' history = [] logger.info('Filter this generated sample due to similarity.') - return { + sample.update({ self.query_key: query, self.response_key: response, self.history_key: history - } + }) + return sample diff --git a/data_juicer/ops/mapper/generate_qa_from_text_mapper.py b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py index 3c5c74df14..e856e0e79b 100644 --- a/data_juicer/ops/mapper/generate_qa_from_text_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py @@ -36,33 +36,23 @@ class GenerateQAFromTextMapper(Mapper): def __init__(self, hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', - trust_remote_code: bool = False, - pattern: Optional[str] = None, - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, - *args, + *, + output_pattern: Optional[str] = None, + enable_vllm: bool = False, + llm_params: Optional[Dict] = None, + sampling_params: Optional[Dict] = None, **kwargs): """ Initialization method. + :param hf_model: Hugginface model id. - :param trust_remote_code: passed to transformers - :param pattern: regular expression pattern to search for within text. + :param output_pattern: Regular expression pattern for parsing + question and answer from model response. :param enable_vllm: Whether to use vllm for inference acceleration. - :param tensor_parallel_size: It is only valid when enable_vllm is True. - The number of GPUs to use for distributed execution with tensor - parallelism. - :param max_model_len: It is only valid when enable_vllm is True. - Model context length. If unspecified, will be automatically - derived from the model config. - :param max_num_seqs: It is only valid when enable_vllm is True. - Maximum number of sequences to be processed in a single iteration. + :param llm_params: LLM initialization parameters. :param sampling_params: Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} - :param args: extra args - :param kwargs: extra args + :param kwargs: Extra keyword arguments. The default data format parsed by this interface is as follows: Model Input: @@ -78,44 +68,40 @@ def __init__(self, ... 
""" - super().__init__(*args, **kwargs) - self.num_proc = 1 + super().__init__(**kwargs) - if pattern is None: - self.pattern = r'Human: (.*?)\nAssistant: (.*?)(?=\nHuman|$)' + if output_pattern is None: + self.output_pattern = r'Human: (.*?)\nAssistant: (.*?)(?=\nHuman|$)' # noqa: E501 else: - self.pattern = pattern + self.output_pattern = output_pattern self.enable_vllm = enable_vllm + llm_params = llm_params or {} + sampling_params = sampling_params or {} if enable_vllm: - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if not tensor_parallel_size: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to \ - {tensor_parallel_size} for vllm.') + if llm_params.get('tensor_parallel_size', 1) > 1: + self.num_proc = 1 self.model_key = prepare_model( model_type='vllm', pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) + **llm_params) self.sampling_params = vllm.SamplingParams(**sampling_params) else: self.model_key = prepare_model( model_type='huggingface', pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code) + return_pipe=True, + **llm_params) self.sampling_params = sampling_params - def _extract_qa(self, output): + def parse_output(self, raw_output): """Extract qestion and answer pair from model output response.""" qa_list = [] - pat = re.compile(self.pattern, re.DOTALL) - qa_pairs = pat.findall(output) + pat = re.compile(self.output_pattern, re.DOTALL) + qa_pairs = pat.findall(raw_output) for qa in qa_pairs: user, assistant = qa @@ -126,36 +112,34 @@ def _extract_qa(self, output): def process_batched(self, samples, rank=None): model, processor = get_model(self.model_key, rank, self.use_cuda()) - keys = samples.keys() - first_key = next(iter(keys)) - num_samples = len(samples[first_key]) - out_samples = { - key: [] - for key in keys | {self.query_key, self.response_key} - } + input_keys = samples.keys() + num_samples = len(samples[next(iter(input_keys))]) + output_keys = input_keys | {self.query_key, self.response_key} + output_samples = {key: [] for key in output_keys} + for i in range(num_samples): - sample = {key: samples[key][i] for key in keys} + messages = [{'role': 'user', 'content': samples[self.text_key][i]}] + if self.enable_vllm: - response = model.generate([sample[self.text_key]], - self.sampling_params) + response = model.chat(messages, self.sampling_params) output = response[0].outputs[0].text else: - inputs = processor(sample[self.text_key], - return_tensors='pt').to(model.device) - response = model.generate(**inputs, **self.sampling_params) - output = processor.decode(response.cpu()[0], - skip_special_tokens=True) - - qa_list = self._extract_qa(output) + # model is pipe + response = model(messages, + return_full_text=False, + **self.sampling_params) + output = response[0]['generated_text'] + qa_list = self.parse_output(output) if len(qa_list) > 0: for q, a in qa_list: - for k, v in sample.items(): - out_samples[k].append(v) - out_samples[self.query_key].append(q) - out_samples[self.response_key].append(a) + for input_k in input_keys: + output_samples[input_k].append(samples[input_k][i]) + output_samples[self.query_key].append(q) + output_samples[self.response_key].append(a) else: - logger.info( - 'No question and answer was extracted from this sample!') + logger.warning( + 'No question and answer was extracted from current sample!' 
+ ) - return out_samples + return output_samples diff --git a/data_juicer/ops/mapper/optimize_qa_mapper.py b/data_juicer/ops/mapper/optimize_qa_mapper.py index cf09865bc7..bba3234aa1 100644 --- a/data_juicer/ops/mapper/optimize_qa_mapper.py +++ b/data_juicer/ops/mapper/optimize_qa_mapper.py @@ -1,8 +1,6 @@ import re from typing import Dict, Optional -from loguru import logger - from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -35,71 +33,63 @@ class OptimizeQAMapper(Mapper): _accelerator = 'cuda' def __init__(self, - *, hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', - trust_remote_code: bool = False, + *, system_prompt: Optional[str] = None, input_template: Optional[str] = None, qa_pair_template: Optional[str] = None, output_pattern: Optional[str] = None, - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, + enable_vllm: bool = False, + llm_params: Dict = None, + sampling_params: Dict = None, **kwargs): """ Initialization method. :param hf_model: Hugging Face model ID. - :param trust_remote_code: Whether to trust remote code from the model - (passed to transformers). :param system_prompt: System prompt for the optimization task. :param input_template: Template for building the input for the model. - :param qa_pair_template: Template for formatting the question-answer - pair. - :param output_pattern: Pattern for parsing the output from the model. + Please make sure the template contains "{qa_pair}", which + corresponds to the question and answer pair generated by + param `qa_pair_template`. + :param qa_pair_template: Template for formatting the question and + answer pair. Please make sure the template contains two + "{}" to format question and answer. + :param output_pattern: Regular expression pattern for parsing + question and answer from model response. :param enable_vllm: Whether to use VLLM for inference acceleration. - :param tensor_parallel_size: Number of GPUs for distributed execution, - valid only if VLLM is enabled. - :param max_model_len: Model context length, valid only if VLLM is - enabled. If unspecified, will be derived from the model config. - :param max_num_seqs: Max number of sequences to process at once, valid - only if VLLM is enabled. + :param llm_params: LLM initialization parameters. :param sampling_params: Sampling parameters for text generation (e.g., {'temperature': 0.9, 'top_p': 0.95}). - :param args: Extra positional arguments. :param kwargs: Extra keyword arguments. 
""" super().__init__(**kwargs) - self.num_proc = 1 self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE - self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN self.qa_pair_template = qa_pair_template or \ self.DEFAULT_QA_PAIR_TEMPLATE + self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN + self.enable_vllm = enable_vllm + llm_params = llm_params or {} + sampling_params = sampling_params or {} if enable_vllm: assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if not tensor_parallel_size: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to \ - {tensor_parallel_size} for vllm.') + if llm_params.get('tensor_parallel_size', 1) > 1: + self.num_proc = 1 self.model_key = prepare_model( model_type='vllm', pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) + **llm_params) self.sampling_params = vllm.SamplingParams(**sampling_params) else: self.model_key = prepare_model( model_type='huggingface', pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code) + return_pipe=False, + **llm_params) self.sampling_params = sampling_params def build_input(self, sample): @@ -116,29 +106,26 @@ def parse_output(self, raw_output): return None, None def process_single(self, sample=None, rank=None): - model, processor = get_model(self.model_key, rank=rank) + model, processor = get_model(self.model_key, rank, self.use_cuda()) + input_prompt = self.build_input(sample) messages = [{ 'role': 'system', 'content': self.system_prompt }, { 'role': 'user', - 'content': self.build_input(sample) + 'content': input_prompt }] - input_prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True) if self.enable_vllm: - response = model.generate([input_prompt], self.sampling_params) + response = model.chat(messages, self.sampling_params) output = response[0].outputs[0].text else: - inputs = processor(input_prompt, - return_tensors='pt').to(model.device) - response = model.generate(**inputs, - eos_token_id=processor.eos_token_id, - **self.sampling_params) - output = processor.decode(response.cpu()[0], - skip_special_tokens=True) + # model is pipe + response = model(messages, + return_full_text=False, + **self.sampling_params) + output = response[0]['generated_text'] parsed_q, parsed_a = self.parse_output(output) if parsed_q: From aaa15c35a02bd45cdb988c3414e0d8a781fa496a Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Mon, 28 Oct 2024 13:17:43 +0000 Subject: [PATCH 14/23] unify tests --- .../test_generate_qa_from_examples_mapper.py | 31 ++++++------ .../test_generate_qa_from_text_mapper.py | 49 +++++++++---------- tests/ops/mapper/test_optimize_qa_mapper.py | 18 ++++--- .../ops/mapper/test_optimize_query_mapper.py | 14 ++++-- .../mapper/test_optimize_response_mapper.py | 14 ++++-- 5 files changed, 73 insertions(+), 53 deletions(-) diff --git a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py index 90e25a787f..80568ba42c 100644 --- a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py +++ b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py @@ -9,16 +9,15 @@ # These tests have been tested locally. 
@SKIPPED_TESTS.register_module() class GenerateQAFromExamplesMapperTest(DataJuicerTestCaseBase): - text_key = 'text' - def _run_generate_instruction(self, enable_vllm=False): + def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): op = GenerateQAFromExamplesMapper( - hf_model='Qwen/Qwen-7B-Chat', seed_file='demos/data/demo-dataset-chatml.jsonl', - instruct_num=2, - trust_remote_code=True, - enable_vllm=enable_vllm + example_num=3, + enable_vllm=enable_vllm, + llm_params=llm_params, + sampling_params=sampling_params, ) from data_juicer.format.empty_formatter import EmptyFormatter @@ -30,14 +29,18 @@ def _run_generate_instruction(self, enable_vllm=False): logger.info(row) # Note: If switching models causes this assert to fail, it may not be a code issue; # the model might just have limited capabilities. - self.assertNotEqual(row[op.query_key], '') - self.assertNotEqual(row[op.response_key], '') - - def test_generate_instruction(self): - self._run_generate_instruction() - - def test_generate_instruction_vllm(self): - self._run_generate_instruction(enable_vllm=True) + self.assertIn(op.query_key, row) + self.assertIn(op.response_key, row) + + def test(self): + sampling_params = {"max_new_tokens": 200} + self._run_op(sampling_params=sampling_params) + + def test_vllm(self): + import torch + llm_params = {"tensor_parallel_size": torch.cuda.device_count()} + sampling_params = {"max_tokens": 200} + self._run_op(enable_vllm=True, llm_params=llm_params, sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_generate_qa_from_text_mapper.py b/tests/ops/mapper/test_generate_qa_from_text_mapper.py index e9e822ab23..3a0d99402e 100644 --- a/tests/ops/mapper/test_generate_qa_from_text_mapper.py +++ b/tests/ops/mapper/test_generate_qa_from_text_mapper.py @@ -11,40 +11,39 @@ class GenerateQAFromTextMapperTest(DataJuicerTestCaseBase): text_key = 'text' - def _run_extract_qa(self, samples, enable_vllm=False, sampling_params={}, **kwargs): + def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): op = GenerateQAFromTextMapper( - hf_model='alibaba-pai/pai-qwen1_5-7b-doc2qa', - trust_remote_code=True, enable_vllm=enable_vllm, - sampling_params=sampling_params, - **kwargs) + llm_params=llm_params, + sampling_params=sampling_params) + + samples = [{ + self.text_key: '蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n' + }] + dataset = Dataset.from_list(samples) - dataset = dataset.map(op.process, batch_size=2) + dataset = dataset.map(op.process) + for row in dataset: logger.info(row) # Note: If switching models causes this assert to fail, it may not be a code issue; # the model might just have limited capabilities. 
- self.assertNotEqual(row[op.query_key], '') - self.assertNotEqual(row[op.response_key], '') + self.assertIn(op.query_key, row) + self.assertIn(op.response_key, row) - def test_extract_qa(self): - samples = [ - { - self.text_key: '蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n' - }] - self._run_extract_qa(samples) + def test(self): + sampling_params = {"max_new_tokens": 200} + self._run_op(sampling_params=sampling_params) - def test_extract_qa_vllm(self): - samples = [ - { - self.text_key: '蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n' - }] - self._run_extract_qa( - samples, - enable_vllm=True, - max_model_len=1024, - max_num_seqs=16, - sampling_params={'temperature': 0.9, 'top_p': 0.95, 'max_tokens': 256}) + def test_vllm(self): + import torch + llm_params = { + "tensor_parallel_size": torch.cuda.device_count(), + "max_model_len": 1024, + "max_num_seqs": 16 + } + sampling_params={'temperature': 0.9, 'top_p': 0.95, 'max_tokens': 200} + self._run_op(enable_vllm=True, llm_params=llm_params,sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_optimize_qa_mapper.py b/tests/ops/mapper/test_optimize_qa_mapper.py index 9933bbaee6..803ec87843 100644 --- a/tests/ops/mapper/test_optimize_qa_mapper.py +++ b/tests/ops/mapper/test_optimize_qa_mapper.py @@ -10,9 +10,11 @@ class OptimizeQAMapperTest(DataJuicerTestCaseBase): query_key = 'query' - def _run_op(self, enable_vllm=False): + def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): op = OptimizeQAMapper( - enable_vllm=enable_vllm + enable_vllm=enable_vllm, + llm_params=llm_params, + sampling_params=sampling_params, ) samples = [{ @@ -23,16 +25,20 @@ def _run_op(self, enable_vllm=False): for sample in samples: result = op.process(sample) logger.info(f'Output results: {result}') - # Note: If switching models causes this assert to fail, it may not be a code issue; + # Note: If switching models causes this assert to fail, it may not be a code issue; # the model might just have limited capabilities. 
self.assertNotEqual(result['query'], '') self.assertNotEqual(result['response'], '') - + def test(self): - self._run_op() + sampling_params = {"max_new_tokens": 200} + self._run_op(sampling_params=sampling_params) def test_vllm(self): - self._run_op(enable_vllm=True) + import torch + llm_params = {"tensor_parallel_size": torch.cuda.device_count()} + sampling_params = {"max_tokens": 200} + self._run_op(enable_vllm=True, llm_params=llm_params, sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_optimize_query_mapper.py b/tests/ops/mapper/test_optimize_query_mapper.py index 4fff9a3662..c653e39ea8 100644 --- a/tests/ops/mapper/test_optimize_query_mapper.py +++ b/tests/ops/mapper/test_optimize_query_mapper.py @@ -9,10 +9,12 @@ @SKIPPED_TESTS.register_module() class OptimizeQueryMapperTest(DataJuicerTestCaseBase): - def _run_op(self, enable_vllm=False): + def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): op = OptimizeQueryMapper( hf_model='alibaba-pai/Qwen2-7B-Instruct-Refine', - enable_vllm=enable_vllm + enable_vllm=enable_vllm, + llm_params=llm_params, + sampling_params=sampling_params ) samples = [{ @@ -28,10 +30,14 @@ def _run_op(self, enable_vllm=False): self.assertNotEqual(result['query'], '') def test(self): - self._run_op() + sampling_params = {"max_new_tokens": 200} + self._run_op(sampling_params=sampling_params) def test_vllm(self): - self._run_op(enable_vllm=True) + import torch + llm_params = {"tensor_parallel_size": torch.cuda.device_count()} + sampling_params = {"max_tokens": 200} + self._run_op(enable_vllm=True, llm_params=llm_params, sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_optimize_response_mapper.py b/tests/ops/mapper/test_optimize_response_mapper.py index 6f1a0fe9f6..5ae8d8bf6c 100644 --- a/tests/ops/mapper/test_optimize_response_mapper.py +++ b/tests/ops/mapper/test_optimize_response_mapper.py @@ -9,9 +9,11 @@ @SKIPPED_TESTS.register_module() class OptimizeResponseMapperTest(DataJuicerTestCaseBase): - def _run_op(self, enable_vllm=False): + def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): op = OptimizeResponseMapper( - enable_vllm=enable_vllm + enable_vllm=enable_vllm, + llm_params=llm_params, + sampling_params=sampling_params ) samples = [{ @@ -27,10 +29,14 @@ def _run_op(self, enable_vllm=False): self.assertNotEqual(result['response'], '') def test(self): - self._run_op() + sampling_params = {"max_new_tokens": 200} + self._run_op(sampling_params=sampling_params) def test_vllm(self): - self._run_op(enable_vllm=True) + import torch + llm_params = {"tensor_parallel_size": torch.cuda.device_count()} + sampling_params = {"max_tokens": 200} + self._run_op(enable_vllm=True, llm_params=llm_params, sampling_params=sampling_params) if __name__ == '__main__': From 01975c86118a660e4fd528738d099d0a27142a7e Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Tue, 29 Oct 2024 02:07:12 +0000 Subject: [PATCH 15/23] update model --- data_juicer/utils/model_utils.py | 569 +++++++++--------- .../test_generate_qa_from_examples_mapper.py | 24 +- .../test_generate_qa_from_text_mapper.py | 43 +- tests/ops/mapper/test_optimize_qa_mapper.py | 26 +- .../ops/mapper/test_optimize_query_mapper.py | 31 +- .../mapper/test_optimize_response_mapper.py | 41 +- 6 files changed, 390 insertions(+), 344 deletions(-) diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index cda046b812..7c6d59ba19 100644 --- 
a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -105,6 +105,67 @@ def check_model(model_name, force=False): return cached_model_path +def prepare_diffusion_model(pretrained_model_name_or_path, + diffusion_type, + torch_dtype='fp32', + revision='main', + trust_remote_code=False): + """ + Prepare and load an Diffusion model from HuggingFace. + + :param pretrained_model_name_or_path: input Diffusion model name + or local path to the model + :param diffusion_type: the use of the diffusion model. It can be + 'image2image', 'text2image', 'inpainting' + :param torch_dtype: the floating point to load the diffusion + model. Can be one of ['fp32', 'fp16', 'bf16'] + :param revision: The specific model version to use. It can be a + branch name, a tag name, a commit id, or any identifier allowed + by Git. + :return: a Diffusion model. + """ + AUTOINSTALL.check(['torch', 'transformers']) + + diffusion_type_to_pipeline = { + 'image2image': diffusers.AutoPipelineForImage2Image, + 'text2image': diffusers.AutoPipelineForText2Image, + 'inpainting': diffusers.AutoPipelineForInpainting + } + + if diffusion_type not in diffusion_type_to_pipeline.keys(): + raise ValueError( + f'Not support {diffusion_type} diffusion_type for diffusion ' + 'model. Can only be one of ' + '["image2image", "text2image", "inpainting"].') + + if torch_dtype not in ['fp32', 'fp16', 'bf16']: + raise ValueError( + f'Not support {torch_dtype} torch_dtype for diffusion ' + 'model. Can only be one of ' + '["fp32", "fp16", "bf16"].') + + if not is_cuda_available() and (torch_dtype == 'fp16' + or torch_dtype == 'bf16'): + raise ValueError( + 'In cpu mode, only fp32 torch_dtype can be used for diffusion' + ' model.') + + pipeline = diffusion_type_to_pipeline[diffusion_type] + if torch_dtype == 'bf16': + torch_dtype = torch.bfloat16 + elif torch_dtype == 'fp16': + torch_dtype = torch.float16 + else: + torch_dtype = torch.float32 + + model = pipeline.from_pretrained(pretrained_model_name_or_path, + revision=revision, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code) + + return model + + def prepare_fasttext_model(model_name='lid.176.bin'): """ Prepare and load a fasttext model. @@ -120,33 +181,58 @@ def prepare_fasttext_model(model_name='lid.176.bin'): return ft_model -def prepare_sentencepiece_model(model_path): +def prepare_huggingface_model(pretrained_model_name_or_path, + *, + return_model=True, + return_pipe=False, + pipe_task='text-generation', + **llm_params): """ - Prepare and load a sentencepiece model. + Prepare and load a HuggingFace model with the correspoding processor. - :param model_path: input model path - :return: model instance + :param pretrained_model_name_or_path: model name or path + :param return_model: return model or not + :param return_pipe: whether to wrap model into pipeline + :param llm_params: LLM initialization parameters. + :return: a tuple of (model, input processor) if `return_model` is True; + otherwise, only the processor is returned. 
""" - logger.info('Loading sentencepiece model...') - sentencepiece_model = sentencepiece.SentencePieceProcessor() - try: - sentencepiece_model.load(check_model(model_path)) - except: # noqa: E722 - sentencepiece_model.load(check_model(model_path, force=True)) - return sentencepiece_model + # require torch for transformer model + AUTOINSTALL.check(['torch']) + processor = transformers.AutoProcessor.from_pretrained( + pretrained_model_name_or_path, **llm_params) -def prepare_sentencepiece_for_lang(lang, name_pattern='{}.sp.model'): - """ - Prepare and load a sentencepiece model for specific langauge. + if return_model: + config = transformers.AutoConfig.from_pretrained( + pretrained_model_name_or_path, **llm_params) + if hasattr(config, 'auto_map'): + class_name = next( + (k for k in config.auto_map if k.startswith('AutoModel')), + 'AutoModel') + else: + # TODO: What happens if more than one + class_name = config.architectures[0] - :param lang: language to render model name - :param name_pattern: pattern to render the model name - :return: model instance. - """ + model_class = getattr(transformers, class_name) + model = model_class.from_pretrained(pretrained_model_name_or_path, + **llm_params) + + if return_pipe: + if isinstance(processor, transformers.PreTrainedTokenizerBase): + pipe_param = {'tokenizer': processor} + elif isinstance(processor, transformers.SequenceFeatureExtractor): + pipe_param = {'feature_extractor': processor} + elif isinstance(processor, transformers.BaseImageProcessor): + pipe_param = {'image_processor': processor} + pipe = transformers.pipeline(task=pipe_task, + model=model, + config=config, + device='cpu', + **pipe_param) + model = pipe - model_name = name_pattern.format(lang) - return prepare_sentencepiece_model(model_name) + return (model, processor) if return_model else processor def prepare_kenlm_model(lang, name_pattern='{}.arpa.bin'): @@ -194,6 +280,159 @@ def prepare_nltk_model(lang, name_pattern='punkt.{}.pickle'): return nltk_model +def prepare_opencv_classifier(model_path): + model = cv2.CascadeClassifier(model_path) + return model + + +def prepare_recognizeAnything_model( + pretrained_model_name_or_path='ram_plus_swin_large_14m.pth', + input_size=384): + """ + Prepare and load recognizeAnything model. + + :param model_name: input model name. + :param input_size: the input size of the model. + """ + logger.info('Loading recognizeAnything model...') + try: + model = ram.ram_plus( + pretrained=check_model(pretrained_model_name_or_path), + image_size=input_size, + vit='swin_l') + except (RuntimeError, UnpicklingError) as e: # noqa: E722 + logger.warning(e) + model = ram.ram_plus(pretrained=check_model( + pretrained_model_name_or_path, force=True), + image_size=input_size, + vit='swin_l') + model.eval() + return model + + +def prepare_sentencepiece_model(model_path): + """ + Prepare and load a sentencepiece model. + + :param model_path: input model path + :return: model instance + """ + logger.info('Loading sentencepiece model...') + sentencepiece_model = sentencepiece.SentencePieceProcessor() + try: + sentencepiece_model.load(check_model(model_path)) + except: # noqa: E722 + sentencepiece_model.load(check_model(model_path, force=True)) + return sentencepiece_model + + +def prepare_sentencepiece_for_lang(lang, name_pattern='{}.sp.model'): + """ + Prepare and load a sentencepiece model for specific langauge. + + :param lang: language to render model name + :param name_pattern: pattern to render the model name + :return: model instance. 
+ """ + + model_name = name_pattern.format(lang) + return prepare_sentencepiece_model(model_name) + + +def prepare_simple_aesthetics_model(pretrained_model_name_or_path, + return_model=True, + trust_remote_code=False): + """ + Prepare and load a simple aesthetics model. + + :param pretrained_model_name_or_path: model name or path + :param return_model: return model or not + :return: a tuple (model, input processor) if `return_model` is True; + otherwise, only the processor is returned. + """ + processor = transformers.CLIPProcessor.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code) + if not return_model: + return processor + else: + if 'v1' in pretrained_model_name_or_path: + model = aes_pre.AestheticsPredictorV1.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code) + elif ('v2' in pretrained_model_name_or_path + and 'linear' in pretrained_model_name_or_path): + model = aes_pre.AestheticsPredictorV2Linear.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code) + elif ('v2' in pretrained_model_name_or_path + and 'relu' in pretrained_model_name_or_path): + model = aes_pre.AestheticsPredictorV2ReLU.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code) + else: + raise ValueError( + 'Not support {}'.format(pretrained_model_name_or_path)) + return (model, processor) + + +def prepare_spacy_model(lang, name_pattern='{}_core_web_md-3.7.0'): + """ + Prepare spacy model for specific language. + + :param lang: language of sapcy model. Should be one of ["zh", + "en"] + :return: corresponding spacy model + """ + import spacy + + assert lang in ['zh', 'en'], 'Diversity only support zh and en' + model_name = name_pattern.format(lang) + logger.info(f'Loading spacy model [{model_name}]...') + compressed_model = '{}.tar.gz'.format(model_name) + + # decompress the compressed model if it's not decompressed + def decompress_model(compressed_model_path): + if not compressed_model_path.endswith('.tar.gz'): + raise ValueError('Only .tar.gz files are supported') + + decompressed_model_path = compressed_model_path.replace('.tar.gz', '') + if os.path.exists(decompressed_model_path) \ + and os.path.isdir(decompressed_model_path): + return decompressed_model_path + + ver_name = os.path.basename(decompressed_model_path) + unver_name = ver_name.rsplit('-', maxsplit=1)[0] + target_dir_in_archive = f'{ver_name}/{unver_name}/{ver_name}/' + + import tarfile + with tarfile.open(compressed_model_path, 'r:gz') as tar: + for member in tar.getmembers(): + if member.name.startswith(target_dir_in_archive): + # relative path without unnecessary directory levels + relative_path = os.path.relpath( + member.name, start=target_dir_in_archive) + target_path = os.path.join(decompressed_model_path, + relative_path) + + if member.isfile(): + # ensure the directory exists + target_directory = os.path.dirname(target_path) + os.makedirs(target_directory, exist_ok=True) + # for files, extract to the specific location + with tar.extractfile(member) as source: + with open(target_path, 'wb') as target: + target.write(source.read()) + return decompressed_model_path + + try: + diversity_model = spacy.load( + decompress_model(check_model(compressed_model))) + except: # noqa: E722 + diversity_model = spacy.load( + decompress_model(check_model(compressed_model, force=True))) + return diversity_model + + def prepare_video_blip_model(pretrained_model_name_or_path, return_model=True, trust_remote_code=False): @@ 
-333,275 +572,35 @@ def __init__(self, config: transformers.Blip2Config) -> None: return (model, processor) if return_model else processor -def prepare_simple_aesthetics_model(pretrained_model_name_or_path, - return_model=True, - trust_remote_code=False): - """ - Prepare and load a simple aesthetics model. - - :param pretrained_model_name_or_path: model name or path - :param return_model: return model or not - :return: a tuple (model, input processor) if `return_model` is True; - otherwise, only the processor is returned. - """ - processor = transformers.CLIPProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code) - if not return_model: - return processor - else: - if 'v1' in pretrained_model_name_or_path: - model = aes_pre.AestheticsPredictorV1.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code) - elif ('v2' in pretrained_model_name_or_path - and 'linear' in pretrained_model_name_or_path): - model = aes_pre.AestheticsPredictorV2Linear.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code) - elif ('v2' in pretrained_model_name_or_path - and 'relu' in pretrained_model_name_or_path): - model = aes_pre.AestheticsPredictorV2ReLU.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code) - else: - raise ValueError( - 'Not support {}'.format(pretrained_model_name_or_path)) - return (model, processor) - - -def prepare_huggingface_model(pretrained_model_name_or_path, - return_model=True, - trust_remote_code=False): +def prepare_vllm_model(pretrained_model_name_or_path, **llm_params): """ Prepare and load a HuggingFace model with the correspoding processor. :param pretrained_model_name_or_path: model name or path - :param return_model: return model or not - :param trust_remote_code: passed to transformers - :return: a tuple (model, input processor) if `return_model` is True; - otherwise, only the processor is returned. - """ - # require torch for transformer model - AUTOINSTALL.check(['torch']) - - processor = transformers.AutoProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code) - - if return_model: - config = transformers.AutoConfig.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code) - if hasattr(config, 'auto_map'): - class_name = next( - (k for k in config.auto_map if k.startswith('AutoModel')), - 'AutoModel') - else: - # TODO: What happens if more than one - class_name = config.architectures[0] - - model_class = getattr(transformers, class_name) - model = model_class.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code) - - return (model, processor) if return_model else processor - - -def prepare_vllm_model(pretrained_model_name_or_path, - return_model=True, - trust_remote_code=False, - tensor_parallel_size=1, - max_model_len=None, - max_num_seqs=256): - """ - Prepare and load a HuggingFace model with the correspoding processor. - - :param pretrained_model_name_or_path: model name or path - :param return_model: return model or not - :param trust_remote_code: passed to transformers - :param tensor_parallel_size: The number of GPUs to use for distributed - execution with tensor parallelism. - :param max_model_len: Model context length. If unspecified, will - be automatically derived from the model config. - :param max_num_seqs: Maximum number of sequences to be processed in a - single iteration. 
- :return: a tuple (model, input processor) if `return_model` is True; - otherwise, only the processor is returned. - """ - processor = transformers.AutoProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code) - - if return_model: - model = vllm.LLM(model=pretrained_model_name_or_path, - trust_remote_code=trust_remote_code, - dtype=torch.float16, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) - - return (model, processor) if return_model else processor - - -def prepare_spacy_model(lang, name_pattern='{}_core_web_md-3.7.0'): - """ - Prepare spacy model for specific language. - - :param lang: language of sapcy model. Should be one of ["zh", - "en"] - :return: corresponding spacy model - """ - import spacy - - assert lang in ['zh', 'en'], 'Diversity only support zh and en' - model_name = name_pattern.format(lang) - logger.info(f'Loading spacy model [{model_name}]...') - compressed_model = '{}.tar.gz'.format(model_name) - - # decompress the compressed model if it's not decompressed - def decompress_model(compressed_model_path): - if not compressed_model_path.endswith('.tar.gz'): - raise ValueError('Only .tar.gz files are supported') - - decompressed_model_path = compressed_model_path.replace('.tar.gz', '') - if os.path.exists(decompressed_model_path) \ - and os.path.isdir(decompressed_model_path): - return decompressed_model_path - - ver_name = os.path.basename(decompressed_model_path) - unver_name = ver_name.rsplit('-', maxsplit=1)[0] - target_dir_in_archive = f'{ver_name}/{unver_name}/{ver_name}/' - - import tarfile - with tarfile.open(compressed_model_path, 'r:gz') as tar: - for member in tar.getmembers(): - if member.name.startswith(target_dir_in_archive): - # relative path without unnecessary directory levels - relative_path = os.path.relpath( - member.name, start=target_dir_in_archive) - target_path = os.path.join(decompressed_model_path, - relative_path) - - if member.isfile(): - # ensure the directory exists - target_directory = os.path.dirname(target_path) - os.makedirs(target_directory, exist_ok=True) - # for files, extract to the specific location - with tar.extractfile(member) as source: - with open(target_path, 'wb') as target: - target.write(source.read()) - return decompressed_model_path - - try: - diversity_model = spacy.load( - decompress_model(check_model(compressed_model))) - except: # noqa: E722 - diversity_model = spacy.load( - decompress_model(check_model(compressed_model, force=True))) - return diversity_model - - -def prepare_diffusion_model(pretrained_model_name_or_path, - diffusion_type, - torch_dtype='fp32', - revision='main', - trust_remote_code=False): + :param llm_params: LLM initialization parameters. + :return: a tuple of (model, tokenizer) """ - Prepare and load an Diffusion model from HuggingFace. + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - :param pretrained_model_name_or_path: input Diffusion model name - or local path to the model - :param diffusion_type: the use of the diffusion model. It can be - 'image2image', 'text2image', 'inpainting' - :param torch_dtype: the floating point to load the diffusion - model. Can be one of ['fp32', 'fp16', 'bf16'] - :param revision: The specific model version to use. It can be a - branch name, a tag name, a commit id, or any identifier allowed - by Git. - :return: a Diffusion model. 
- """ - AUTOINSTALL.check(['torch', 'transformers']) + model = vllm.LLM(model=pretrained_model_name_or_path, **llm_params) + tokenizer = model.get_tokenizer() - diffusion_type_to_pipeline = { - 'image2image': diffusers.AutoPipelineForImage2Image, - 'text2image': diffusers.AutoPipelineForText2Image, - 'inpainting': diffusers.AutoPipelineForInpainting - } - - if diffusion_type not in diffusion_type_to_pipeline.keys(): - raise ValueError( - f'Not support {diffusion_type} diffusion_type for diffusion ' - 'model. Can only be one of ' - '["image2image", "text2image", "inpainting"].') - - if torch_dtype not in ['fp32', 'fp16', 'bf16']: - raise ValueError( - f'Not support {torch_dtype} torch_dtype for diffusion ' - 'model. Can only be one of ' - '["fp32", "fp16", "bf16"].') - - if not is_cuda_available() and (torch_dtype == 'fp16' - or torch_dtype == 'bf16'): - raise ValueError( - 'In cpu mode, only fp32 torch_dtype can be used for diffusion' - ' model.') - - pipeline = diffusion_type_to_pipeline[diffusion_type] - if torch_dtype == 'bf16': - torch_dtype = torch.bfloat16 - elif torch_dtype == 'fp16': - torch_dtype = torch.float16 - else: - torch_dtype = torch.float32 - - model = pipeline.from_pretrained(pretrained_model_name_or_path, - revision=revision, - torch_dtype=torch_dtype, - trust_remote_code=trust_remote_code) - - return model - - -def prepare_recognizeAnything_model( - pretrained_model_name_or_path='ram_plus_swin_large_14m.pth', - input_size=384): - """ - Prepare and load recognizeAnything model. - - :param model_name: input model name. - :param input_size: the input size of the model. - """ - logger.info('Loading recognizeAnything model...') - try: - model = ram.ram_plus( - pretrained=check_model(pretrained_model_name_or_path), - image_size=input_size, - vit='swin_l') - except (RuntimeError, UnpicklingError) as e: # noqa: E722 - logger.warning(e) - model = ram.ram_plus(pretrained=check_model( - pretrained_model_name_or_path, force=True), - image_size=input_size, - vit='swin_l') - model.eval() - return model - - -def prepare_opencv_classifier(model_path): - model = cv2.CascadeClassifier(model_path) - return model + return (model, tokenizer) MODEL_FUNCTION_MAPPING = { + 'diffusion': prepare_diffusion_model, 'fasttext': prepare_fasttext_model, - 'sentencepiece': prepare_sentencepiece_for_lang, + 'huggingface': prepare_huggingface_model, 'kenlm': prepare_kenlm_model, 'nltk': prepare_nltk_model, - 'huggingface': prepare_huggingface_model, + 'opencv_classifier': prepare_opencv_classifier, + 'recognizeAnything': prepare_recognizeAnything_model, + 'sentencepiece': prepare_sentencepiece_for_lang, 'simple_aesthetics': prepare_simple_aesthetics_model, 'spacy': prepare_spacy_model, - 'diffusion': prepare_diffusion_model, 'video_blip': prepare_video_blip_model, - 'recognizeAnything': prepare_recognizeAnything_model, 'vllm': prepare_vllm_model, - 'opencv_classifier': prepare_opencv_classifier, } @@ -611,21 +610,29 @@ def prepare_model(model_type, **model_kwargs): list(MODEL_FUNCTION_MAPPING.keys())) model_func = MODEL_FUNCTION_MAPPING[model_type] model_key = partial(model_func, **model_kwargs) - # always instantiate once for possible caching - model_key() + if model_type != 'vllm': + # instantiate once for possible caching + model_key() return model_key -def move_to_cuda(model, rank): +def move_to_cuda(objs, rank): # Assuming model can be either a single module or a tuple of modules - if not isinstance(model, tuple): - model = (model, ) + if not isinstance(objs, tuple): + objs = (objs, ) - for module 
in model: - if callable(getattr(module, 'to', None)): + for idx, obj in enumerate(objs): + if isinstance(obj, transformers.Pipeline): + obj = obj.model + if callable(getattr(obj, 'to', None)): logger.debug( - f'Moving {module.__class__.__name__} to CUDA device {rank}') - module.to(f'cuda:{rank}') + f'Moving {obj.__class__.__name__} to CUDA device {rank}') + obj.to(f'cuda:{rank}') + if hasattr(obj, 'device'): + try: + objs[idx].device = obj.device + except: # noqa: E722 + pass def get_model(model_key=None, rank=None, use_cuda=False): diff --git a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py index 80568ba42c..c46b410af4 100644 --- a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py +++ b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py @@ -1,17 +1,23 @@ import unittest -import json + from loguru import logger -from data_juicer.ops.mapper.generate_qa_from_examples_mapper import GenerateQAFromExamplesMapper + +from data_juicer.ops.mapper.generate_qa_from_examples_mapper import \ + GenerateQAFromExamplesMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) + # Skip tests for this OP in the GitHub actions due to disk space limitation. # These tests have been tested locally. @SKIPPED_TESTS.register_module() class GenerateQAFromExamplesMapperTest(DataJuicerTestCaseBase): text_key = 'text' - def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): + def _run_op(self, + enable_vllm=False, + llm_params=None, + sampling_params=None): op = GenerateQAFromExamplesMapper( seed_file='demos/data/demo-dataset-chatml.jsonl', example_num=3, @@ -27,20 +33,20 @@ def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): for row in dataset: logger.info(row) - # Note: If switching models causes this assert to fail, it may not be a code issue; - # the model might just have limited capabilities. self.assertIn(op.query_key, row) self.assertIn(op.response_key, row) def test(self): - sampling_params = {"max_new_tokens": 200} + sampling_params = {'max_new_tokens': 200} self._run_op(sampling_params=sampling_params) def test_vllm(self): import torch - llm_params = {"tensor_parallel_size": torch.cuda.device_count()} - sampling_params = {"max_tokens": 200} - self._run_op(enable_vllm=True, llm_params=llm_params, sampling_params=sampling_params) + llm_params = {'tensor_parallel_size': torch.cuda.device_count()} + sampling_params = {'max_tokens': 200} + self._run_op(enable_vllm=True, + llm_params=llm_params, + sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_generate_qa_from_text_mapper.py b/tests/ops/mapper/test_generate_qa_from_text_mapper.py index 3a0d99402e..d1a26e61c0 100644 --- a/tests/ops/mapper/test_generate_qa_from_text_mapper.py +++ b/tests/ops/mapper/test_generate_qa_from_text_mapper.py @@ -1,24 +1,31 @@ import unittest + from loguru import logger + from data_juicer.core.data import NestedDataset as Dataset -from data_juicer.ops.mapper.generate_qa_from_text_mapper import GenerateQAFromTextMapper +from data_juicer.ops.mapper.generate_qa_from_text_mapper import \ + GenerateQAFromTextMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) + # Skip tests for this OP in the GitHub actions due to disk space limitation. # These tests have been tested locally. 
@SKIPPED_TESTS.register_module() class GenerateQAFromTextMapperTest(DataJuicerTestCaseBase): text_key = 'text' - def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): - op = GenerateQAFromTextMapper( - enable_vllm=enable_vllm, - llm_params=llm_params, - sampling_params=sampling_params) - + def _run_op(self, + enable_vllm=False, + llm_params=None, + sampling_params=None): + op = GenerateQAFromTextMapper(enable_vllm=enable_vllm, + llm_params=llm_params, + sampling_params=sampling_params) + samples = [{ - self.text_key: '蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n' + self.text_key: + '蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n' }] dataset = Dataset.from_list(samples) @@ -26,24 +33,28 @@ def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): for row in dataset: logger.info(row) - # Note: If switching models causes this assert to fail, it may not be a code issue; - # the model might just have limited capabilities. self.assertIn(op.query_key, row) self.assertIn(op.response_key, row) def test(self): - sampling_params = {"max_new_tokens": 200} + sampling_params = {'max_new_tokens': 200} self._run_op(sampling_params=sampling_params) def test_vllm(self): import torch llm_params = { - "tensor_parallel_size": torch.cuda.device_count(), - "max_model_len": 1024, - "max_num_seqs": 16 + 'tensor_parallel_size': torch.cuda.device_count(), + 'max_model_len': 1024, + 'max_num_seqs': 16 + } + sampling_params = { + 'temperature': 0.9, + 'top_p': 0.95, + 'max_tokens': 200 } - sampling_params={'temperature': 0.9, 'top_p': 0.95, 'max_tokens': 200} - self._run_op(enable_vllm=True, llm_params=llm_params,sampling_params=sampling_params) + self._run_op(enable_vllm=True, + llm_params=llm_params, + sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_optimize_qa_mapper.py b/tests/ops/mapper/test_optimize_qa_mapper.py index 803ec87843..1b275fc837 100644 --- a/tests/ops/mapper/test_optimize_qa_mapper.py +++ b/tests/ops/mapper/test_optimize_qa_mapper.py @@ -1,16 +1,22 @@ import unittest + from loguru import logger + from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) + # Skip tests for this OP in the GitHub actions due to disk space limitation. # These tests have been tested locally. @SKIPPED_TESTS.register_module() class OptimizeQAMapperTest(DataJuicerTestCaseBase): query_key = 'query' - def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): + def _run_op(self, + enable_vllm=False, + llm_params=None, + sampling_params=None): op = OptimizeQAMapper( enable_vllm=enable_vllm, llm_params=llm_params, @@ -18,27 +24,29 @@ def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): ) samples = [{ - 'query': '鱼香肉丝怎么做?', - 'response': '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' + 'query': + '鱼香肉丝怎么做?', + 'response': + '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' }] for sample in samples: result = op.process(sample) logger.info(f'Output results: {result}') - # Note: If switching models causes this assert to fail, it may not be a code issue; - # the model might just have limited capabilities. 
self.assertNotEqual(result['query'], '') self.assertNotEqual(result['response'], '') def test(self): - sampling_params = {"max_new_tokens": 200} + sampling_params = {'max_new_tokens': 200} self._run_op(sampling_params=sampling_params) def test_vllm(self): import torch - llm_params = {"tensor_parallel_size": torch.cuda.device_count()} - sampling_params = {"max_tokens": 200} - self._run_op(enable_vllm=True, llm_params=llm_params, sampling_params=sampling_params) + llm_params = {'tensor_parallel_size': torch.cuda.device_count()} + sampling_params = {'max_tokens': 200} + self._run_op(enable_vllm=True, + llm_params=llm_params, + sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_optimize_query_mapper.py b/tests/ops/mapper/test_optimize_query_mapper.py index c653e39ea8..3162d6d19d 100644 --- a/tests/ops/mapper/test_optimize_query_mapper.py +++ b/tests/ops/mapper/test_optimize_query_mapper.py @@ -1,43 +1,50 @@ import unittest + from loguru import logger + from data_juicer.ops.mapper.optimize_query_mapper import OptimizeQueryMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) + # Skip tests for this OP in the GitHub actions due to disk space limitation. # These tests have been tested locally. @SKIPPED_TESTS.register_module() class OptimizeQueryMapperTest(DataJuicerTestCaseBase): - def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): + def _run_op(self, + enable_vllm=False, + llm_params=None, + sampling_params=None): op = OptimizeQueryMapper( hf_model='alibaba-pai/Qwen2-7B-Instruct-Refine', enable_vllm=enable_vllm, llm_params=llm_params, - sampling_params=sampling_params - ) + sampling_params=sampling_params) samples = [{ - 'query': '鱼香肉丝怎么做?', - 'response': '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' + 'query': + '鱼香肉丝怎么做?', + 'response': + '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' }] for sample in samples: result = op.process(sample) logger.info(f'Output results: {result}') - # Note: If switching models causes this assert to fail, it may not be a code issue; - # the model might just have limited capabilities. self.assertNotEqual(result['query'], '') - + def test(self): - sampling_params = {"max_new_tokens": 200} + sampling_params = {'max_new_tokens': 200} self._run_op(sampling_params=sampling_params) def test_vllm(self): import torch - llm_params = {"tensor_parallel_size": torch.cuda.device_count()} - sampling_params = {"max_tokens": 200} - self._run_op(enable_vllm=True, llm_params=llm_params, sampling_params=sampling_params) + llm_params = {'tensor_parallel_size': torch.cuda.device_count()} + sampling_params = {'max_tokens': 200} + self._run_op(enable_vllm=True, + llm_params=llm_params, + sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_optimize_response_mapper.py b/tests/ops/mapper/test_optimize_response_mapper.py index 5ae8d8bf6c..6922653ca6 100644 --- a/tests/ops/mapper/test_optimize_response_mapper.py +++ b/tests/ops/mapper/test_optimize_response_mapper.py @@ -1,42 +1,49 @@ import unittest + from loguru import logger -from data_juicer.ops.mapper.optimize_response_mapper import OptimizeResponseMapper + +from data_juicer.ops.mapper.optimize_response_mapper import \ + OptimizeResponseMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) + # Skip tests for this OP in the GitHub actions due to disk space limitation. # These tests have been tested locally. 
@SKIPPED_TESTS.register_module() class OptimizeResponseMapperTest(DataJuicerTestCaseBase): - - def _run_op(self, enable_vllm=False, llm_params=None, sampling_params=None): - op = OptimizeResponseMapper( - enable_vllm=enable_vllm, - llm_params=llm_params, - sampling_params=sampling_params - ) + + def _run_op(self, + enable_vllm=False, + llm_params=None, + sampling_params=None): + op = OptimizeResponseMapper(enable_vllm=enable_vllm, + llm_params=llm_params, + sampling_params=sampling_params) samples = [{ - 'query': '鱼香肉丝怎么做?', - 'response': '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' + 'query': + '鱼香肉丝怎么做?', + 'response': + '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' }] for sample in samples: result = op.process(sample) logger.info(f'Output results: {result}') - # Note: If switching models causes this assert to fail, it may not be a code issue; - # the model might just have limited capabilities. self.assertNotEqual(result['response'], '') - + def test(self): - sampling_params = {"max_new_tokens": 200} + sampling_params = {'max_new_tokens': 200} self._run_op(sampling_params=sampling_params) def test_vllm(self): import torch - llm_params = {"tensor_parallel_size": torch.cuda.device_count()} - sampling_params = {"max_tokens": 200} - self._run_op(enable_vllm=True, llm_params=llm_params, sampling_params=sampling_params) + llm_params = {'tensor_parallel_size': torch.cuda.device_count()} + sampling_params = {'max_tokens': 200} + self._run_op(enable_vllm=True, + llm_params=llm_params, + sampling_params=sampling_params) if __name__ == '__main__': From daee1f97ed0c61cbd0f52b19f37458e4627189ac Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Tue, 29 Oct 2024 02:59:19 +0000 Subject: [PATCH 16/23] update docs --- configs/config_all.yaml | 51 ++++++++++--------- .../generate_qa_from_examples_mapper.py | 30 +++++------ .../mapper/generate_qa_from_text_mapper.py | 10 ++-- data_juicer/ops/mapper/optimize_qa_mapper.py | 12 ++--- .../ops/mapper/optimize_query_mapper.py | 2 +- .../ops/mapper/optimize_response_mapper.py | 2 +- docs/Operators.md | 10 ++-- docs/Operators_ZH.md | 8 +-- 8 files changed, 65 insertions(+), 60 deletions(-) diff --git a/configs/config_all.yaml b/configs/config_all.yaml index 8d772814f1..132d187aea 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -61,29 +61,26 @@ process: - clean_links_mapper: # remove web links from text. - clean_copyright_mapper: # remove copyright comments. - expand_macro_mapper: # expand macro definitions in Latex text. - - generate_qa_from_text_mapper: # mapper to generate question and answer pairs from text. - hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa' # model name on huggingface to extract question and answer pair. - pattern: null # regular expression pattern to search for within text. - enable_vllm: false # Whether to use vllm for inference acceleration. - tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism. - max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config. - max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration. - sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} - fix_unicode_mapper: # fix unicode errors in text. 
- generate_qa_from_examples_mapper: # mapper to generate question and answer pairs from examples. - hf_model: 'Qwen/Qwen-7B-Chat' # model name on huggingface to generate instruction. - seed_file: 'demos/data/demo-dataset-chatml.jsonl' # Seed file as instruction samples to generate new instructions, chatml format. - instruct_num: 3 # the number of generated samples. - similarity_threshold: 0.7 # the similarity score threshold between the generated samples and the seed samples.Range from 0 to 1. Samples with similarity score less than this threshold will be kept. - prompt_template: null # Prompt template for generate samples. Please make sure the template contains "{augmented_data}", which corresponds to the augmented samples. - qa_pair_template: null # Prompt template for generate question and answer pair description. Please make sure the template contains two "{}" to format question and answer. Default: '【问题】\n{}\n【回答】\n{}\n'. - example_template: null # Prompt template for generate examples. Please make sure the template contains "{qa_pairs}", which corresponds to the question and answer pair description generated by param `qa_pair_template`. - qa_extraction_pattern: null # Regular expression pattern for parsing question and answer from model response. + hf_model: 'Qwen/Qwen2.5-7B-Instruct' # Model name on huggingface to generate question and answer pairs. + seed_file: 'demos/data/demo-dataset-chatml.jsonl' # Path to the seed file in chatml format. + instruct_num: 3 # The number of randomly selected seed examples. + similarity_threshold: 0.7 # the similarity score threshold between the generated samples and the seed examples. Range from 0 to 1. Samples with similarity score less than this threshold will be kept. + system_prompt: null # System prompt for guiding the generation task. + input_template: null # Template for building the input prompt. + example_template: null # Template for formatting each QA example. + qa_pair_template: null # Template for formatting a single QA pair within each example. + output_pattern: null # Regular expression pattern to extract questions and answers from model response. enable_vllm: false # Whether to use vllm for inference acceleration. - tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism. - max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config. - max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration. + llm_params: null # Parameters for initializing the model. sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} + - generate_qa_from_text_mapper: # mapper to generate question and answer pairs from text. + hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa' # Model name on huggingface to generate question and answer pairs. + output_pattern: null # Regular expression pattern to extract questions and answers from model response. + enable_vllm: false # Whether to use vllm for inference acceleration. + llm_params: null # Parameters for initializing the model. + sampling_params: null # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} - image_blur_mapper: # mapper to blur images. 
p: 0.2 # probability of the image being blured blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian'] @@ -145,13 +142,17 @@ process: delete_random_char: false # whether to open the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强" swap_random_char: false # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法" replace_equivalent_num: false # whether to open the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法" - - optimize_query_mapper: # optimize instruction query. - hf_model: 'alibaba-pai/Qwen2-7B-Instruct-Refine' # model name on huggingface to optimize instruction + - optimize_qa_mapper: # optimize question-answer pairs. + hf_model: 'Qwen/Qwen2.5-7B-Instruct' # model name on huggingface. + system_prompt: null # System prompt for guiding the optimization task. + input_template: null # Template for building the input for the model. + qa_pair_template: null # Template for formatting the question and answer pair. + output_pattern: null # Regular expression pattern to extract question and answer from model response. enable_vllm: false # whether to use vllm for inference acceleration. - tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism. - max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config. - max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration. - sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} + llm_params: null # Parameters for initializing the model. + sampling_params: null # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} + - optimize_query_mapper: # optimize query in question-answer pairs. + - optimize_response_mapper: # optimize response in question-answer pairs. - punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations. - remove_bibliography_mapper: # remove bibliography from Latex text. - remove_comments_mapper: # remove comments from Latex text, code, etc. diff --git a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py index 4ba27523e0..4a37ee24d9 100644 --- a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py @@ -68,27 +68,27 @@ def __init__(self, """ Initialization method. - :param hf_model: Hugginface model id. - :param seed_file: Seed file path, chatml format. - :param example_num: The number of QA example. + :param hf_model: Hugginface model ID. + :param seed_file: Path to the seed file in chatml format. + :param example_num: The number of selected examples. Randomly select N examples from "seed_file" and - put them into prompt as instruction samples. + put them into prompt as QA examples. :param similarity_threshold: The similarity score threshold - between the generated samples and the seed samples. + between the generated samples and the seed examples. Range from 0 to 1. 
Samples with similarity score less than this threshold will be kept. - :param system_prompt: System prompt for the generation task. - :param input_template: Template for building the input for the model. - Please make sure the template contains "{examples}", which - will be filled by `example_num` formatted `example_template`. + :param system_prompt: System prompt for guiding the generation task. + :param input_template: Template for building the input prompt. It must + include "{examples}", which will be replaced by `example_num` + formatted examples defined by `example_template`. :param example_template: Template for formatting each QA example. - :param qa_pair_template: Template for formatting one-round QA pair - within each QA example. Please make sure the template contains - two "{}" to format question and answer. - :param output_pattern: Regular expression pattern for parsing - question and answer from model response. + :param qa_pair_template: Template for formatting a single QA pair + within each example. Must include two placeholders "{}" for the + question and answer. + :param output_pattern: Regular expression pattern to extract questions + and answers from model response. :param enable_vllm: Whether to use vllm for inference acceleration. - :param llm_params: LLM initialization parameters. + :param llm_params: Parameters for initializing the model. :param sampling_params: Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} :param kwargs: Extra keyword arguments. diff --git a/data_juicer/ops/mapper/generate_qa_from_text_mapper.py b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py index e856e0e79b..991807af26 100644 --- a/data_juicer/ops/mapper/generate_qa_from_text_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py @@ -45,12 +45,12 @@ def __init__(self, """ Initialization method. - :param hf_model: Hugginface model id. - :param output_pattern: Regular expression pattern for parsing - question and answer from model response. + :param hf_model: Hugginface model ID. + :param output_pattern: Regular expression pattern to extract + questions and answers from model response. :param enable_vllm: Whether to use vllm for inference acceleration. - :param llm_params: LLM initialization parameters. - :param sampling_params: Sampling parameters for text generation. + :param llm_params: Parameters for initializing the model. + :param sampling_params: Sampling parameters for text generation, e.g {'temperature': 0.9, 'top_p': 0.95} :param kwargs: Extra keyword arguments. diff --git a/data_juicer/ops/mapper/optimize_qa_mapper.py b/data_juicer/ops/mapper/optimize_qa_mapper.py index bba3234aa1..8418573d20 100644 --- a/data_juicer/ops/mapper/optimize_qa_mapper.py +++ b/data_juicer/ops/mapper/optimize_qa_mapper.py @@ -40,14 +40,14 @@ def __init__(self, qa_pair_template: Optional[str] = None, output_pattern: Optional[str] = None, enable_vllm: bool = False, - llm_params: Dict = None, - sampling_params: Dict = None, + llm_params: Optional[Dict] = None, + sampling_params: Optional[Dict] = None, **kwargs): """ Initialization method. :param hf_model: Hugging Face model ID. - :param system_prompt: System prompt for the optimization task. + :param system_prompt: System prompt for guiding the optimization task. :param input_template: Template for building the input for the model. 
Please make sure the template contains "{qa_pair}", which corresponds to the question and answer pair generated by @@ -55,10 +55,10 @@ def __init__(self, :param qa_pair_template: Template for formatting the question and answer pair. Please make sure the template contains two "{}" to format question and answer. - :param output_pattern: Regular expression pattern for parsing - question and answer from model response. + :param output_pattern: Regular expression pattern to extract question + and answer from model response. :param enable_vllm: Whether to use VLLM for inference acceleration. - :param llm_params: LLM initialization parameters. + :param llm_params: Parameters for initializing the model. :param sampling_params: Sampling parameters for text generation (e.g., {'temperature': 0.9, 'top_p': 0.95}). :param kwargs: Extra keyword arguments. diff --git a/data_juicer/ops/mapper/optimize_query_mapper.py b/data_juicer/ops/mapper/optimize_query_mapper.py index a12c505755..6ddbe71c0c 100644 --- a/data_juicer/ops/mapper/optimize_query_mapper.py +++ b/data_juicer/ops/mapper/optimize_query_mapper.py @@ -9,7 +9,7 @@ @OPERATORS.register_module(OP_NAME) class OptimizeQueryMapper(OptimizeQAMapper): """ - Mapper to optimize only query in question-answer pairs. + Mapper to optimize query in question-answer pairs. """ DEFAULT_SYSTEM_PROMPT = '优化问答对中的问题,将其更加详细具体,但仍可以由原答案回答。只输出优化后的问题,不要输出多余内容。' diff --git a/data_juicer/ops/mapper/optimize_response_mapper.py b/data_juicer/ops/mapper/optimize_response_mapper.py index cb68cc4c42..158159a9d3 100644 --- a/data_juicer/ops/mapper/optimize_response_mapper.py +++ b/data_juicer/ops/mapper/optimize_response_mapper.py @@ -9,7 +9,7 @@ @OPERATORS.register_module(OP_NAME) class OptimizeResponseMapper(OptimizeQAMapper): """ - Mapper to optimize only response in question-answer pairs. + Mapper to optimize response in question-answer pairs. """ DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。' diff --git a/docs/Operators.md b/docs/Operators.md index 721885f0e5..63052389c5 100644 --- a/docs/Operators.md +++ b/docs/Operators.md @@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types. | Type | Number | Description | |-----------------------------------|:------:|-------------------------------------------------| | [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data | -| [ Mapper ]( #mapper ) | 47 | Edits and transforms samples | +| [ Mapper ]( #mapper ) | 49 | Edits and transforms samples | | [ Filter ]( #filter ) | 43 | Filters out low-quality samples | | [ Deduplicator ]( #deduplicator ) | 5 | Detects and removes duplicate samples | | [ Selector ]( #selector ) | 4 | Selects top samples based on ranking | @@ -58,9 +58,9 @@ All the specific operators are listed below, each featured with several capabili | clean_ip_mapper | General | en, zh | Removes IP addresses | | clean_links_mapper | General, Code | en, zh | Removes links, such as those starting with http or ftp | | expand_macro_mapper | LaTeX | en, zh | Expands macros usually defined at the top of TeX documents | -| generate_qa_from_text_mapper | General | en, zh | Generate question and answer pairs from text samples. | | fix_unicode_mapper | General | en, zh | Fixes broken Unicodes (by [ftfy](https://ftfy.readthedocs.io/)) | -| generate_qa_from_examples_mapper | General | en, zh | Generate question and answer pairs from examples samples. | +| generate_qa_from_examples_mapper | General | en, zh | Generate question and answer pairs based on examples. 
| +| generate_qa_from_text_mapper | General | en, zh | Generate question and answer pairs from text. | | image_blur_mapper | Image | - | Blur images | | image_captioning_from_gpt4v_mapper | Multimodal | - | generate samples whose texts are generated based on gpt-4-visison and the image | | image_captioning_mapper | Multimodal | - | generate samples whose captions are generated based on another model (such as blip2) and the figure within the original sample | @@ -69,7 +69,9 @@ All the specific operators are listed below, each featured with several capabili | image_tagging_mapper | Multimodal | - | Mapper to generate image tags from the input images. | | nlpaug_en_mapper | General | en | Simply augments texts in English based on the `nlpaug` library | | nlpcda_zh_mapper | General | zh | Simply augments texts in Chinese based on the `nlpcda` library | -| optimize_query_mapper | General | en, zh | Optimize instruction query samples. | +| optimize_qa_mapper | General | en, zh | Optimize both the query and response in question-answering samples. | +| optimize_query_mapper | General | en, zh | Optimize the query in question-answering samples. | +| optimize_response_mapper | General | en, zh | Optimize the response in question-answering samples. | | punctuation_normalization_mapper | General | en, zh | Normalizes various Unicode punctuations to their ASCII equivalents | | remove_bibliography_mapper | LaTeX | en, zh | Removes the bibliography of TeX documents | | remove_comments_mapper | LaTeX | en, zh | Removes the comments of TeX documents | diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md index 2f07b2b561..25f32c247d 100644 --- a/docs/Operators_ZH.md +++ b/docs/Operators_ZH.md @@ -11,7 +11,7 @@ Data-Juicer 中的算子分为以下 5 种类型。 | 类型 | 数量 | 描述 | |------------------------------------|:--:|---------------| | [ Formatter ]( #formatter ) | 7 | 发现、加载、规范化原始数据 | -| [ Mapper ]( #mapper ) | 47 | 对数据样本进行编辑和转换 | +| [ Mapper ]( #mapper ) | 49 | 对数据样本进行编辑和转换 | | [ Filter ]( #filter ) | 43 | 过滤低质量样本 | | [ Deduplicator ]( #deduplicator ) | 5 | 识别、删除重复样本 | | [ Selector ]( #selector ) | 4 | 基于排序选取高质量样本 | @@ -57,9 +57,9 @@ Data-Juicer 中的算子分为以下 5 种类型。 | clean_ip_mapper | General | en, zh | 删除 IP 地址 | | clean_links_mapper | General, Code | en, zh | 删除链接,例如以 http 或 ftp 开头的 | | expand_macro_mapper | LaTeX | en, zh | 扩展通常在 TeX 文档顶部定义的宏 | -| generate_qa_from_text_mapper | General | en, zh | 从文本中生成问答对 | | fix_unicode_mapper | General | en, zh | 修复损坏的 Unicode(借助 [ftfy](https://ftfy.readthedocs.io/)) | -| generate_qa_from_examples_mapper | General | en, zh | 根据种子数据,生成新的样本。 | +| generate_qa_from_examples_mapper | General | en, zh | 根据种子数据,生成新的对话样本。 | +| generate_qa_from_text_mapper | General | en, zh | 从文本中生成问答对 | | image_blur_mapper | Image | - | 对图像进行模糊处理 | | image_captioning_from_gpt4v_mapper | Multimodal | - | 基于gpt-4-vision和图像生成文本 | | image_captioning_mapper | Multimodal | - | 生成样本,其标题是根据另一个辅助模型(例如 blip2)和原始样本中的图形生成的。 | @@ -68,7 +68,9 @@ Data-Juicer 中的算子分为以下 5 种类型。 | image_tagging_mapper | Multimodal | - | 从输入图片中生成图片标签 | | nlpaug_en_mapper | General | en | 使用`nlpaug`库对英语文本进行简单增强 | | nlpcda_zh_mapper | General | zh | 使用`nlpcda`库对中文文本进行简单增强 | +| optimize_qa_mapper | General | en, zh | 指令优化,优化问题和答案 | | optimize_query_mapper | General | en, zh | 指令优化,优化 query | +| optimize_response_mapper | General | en, zh | 指令优化,优化 response | | punctuation_normalization_mapper | General | en, zh | 将各种 Unicode 标点符号标准化为其 ASCII 等效项 | | remove_bibliography_mapper | LaTeX | en, zh | 删除 TeX 文档的参考文献 | | remove_comments_mapper | LaTeX | en, zh 
| 删除 TeX 文档中的注释 | From dad81cd235285cf5cae68522e2378e494667dfdd Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Thu, 31 Oct 2024 09:10:06 +0000 Subject: [PATCH 17/23] refine model loading --- configs/config_all.yaml | 6 +- .../generate_qa_from_examples_mapper.py | 30 ++-- .../mapper/generate_qa_from_text_mapper.py | 35 ++--- data_juicer/ops/mapper/optimize_qa_mapper.py | 43 +++--- .../ops/mapper/optimize_query_mapper.py | 2 +- data_juicer/utils/model_utils.py | 137 ++++++++++-------- .../test_generate_qa_from_examples_mapper.py | 20 +-- .../test_generate_qa_from_text_mapper.py | 28 ++-- tests/ops/mapper/test_optimize_qa_mapper.py | 41 +++--- .../ops/mapper/test_optimize_query_mapper.py | 32 ++-- .../mapper/test_optimize_response_mapper.py | 30 ++-- 11 files changed, 215 insertions(+), 189 deletions(-) diff --git a/configs/config_all.yaml b/configs/config_all.yaml index 132d187aea..aa07eeeea4 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -73,13 +73,13 @@ process: qa_pair_template: null # Template for formatting a single QA pair within each example. output_pattern: null # Regular expression pattern to extract questions and answers from model response. enable_vllm: false # Whether to use vllm for inference acceleration. - llm_params: null # Parameters for initializing the model. + model_params: null # Parameters for initializing the model. sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} - generate_qa_from_text_mapper: # mapper to generate question and answer pairs from text. hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa' # Model name on huggingface to generate question and answer pairs. output_pattern: null # Regular expression pattern to extract questions and answers from model response. enable_vllm: false # Whether to use vllm for inference acceleration. - llm_params: null # Parameters for initializing the model. + model_params: null # Parameters for initializing the model. sampling_params: null # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} - image_blur_mapper: # mapper to blur images. p: 0.2 # probability of the image being blured @@ -149,7 +149,7 @@ process: qa_pair_template: null # Template for formatting the question and answer pair. output_pattern: null # Regular expression pattern to extract question and answer from model response. enable_vllm: false # whether to use vllm for inference acceleration. - llm_params: null # Parameters for initializing the model. + model_params: null # Parameters for initializing the model. sampling_params: null # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} - optimize_query_mapper: # optimize query in question-answer pairs. - optimize_response_mapper: # optimize response in question-answer pairs. 
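For reference, a minimal sketch of how the renamed `model_params` argument might be passed when constructing one of these ops directly in Python rather than through config_all.yaml. The model name matches the default shown above; the remaining values are illustrative assumptions, not part of this patch.

from data_juicer.ops.mapper.generate_qa_from_text_mapper import \
    GenerateQAFromTextMapper

# Sketch: HuggingFace inference path. `model_params` is forwarded to model and
# processor loading, `sampling_params` to text generation.
op = GenerateQAFromTextMapper(
    hf_model='alibaba-pai/pai-qwen1_5-7b-doc2qa',
    enable_vllm=False,
    model_params={'trust_remote_code': True},  # assumed example kwarg
    sampling_params={'max_new_tokens': 200},
)

With `enable_vllm=True`, `model_params` is instead forwarded to `vllm.LLM` (for example `max_model_len` or `max_num_seqs`), and `tensor_parallel_size` falls back to the number of visible GPUs when unspecified.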
diff --git a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py index 4a37ee24d9..ceaf33d9d9 100644 --- a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py @@ -46,7 +46,7 @@ class GenerateQAFromExamplesMapper(Mapper): DEFAULT_INPUT_TEMPLATE = '{examples}' DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{qa_pairs}' DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' - DEFAULT_OUTPUT_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*?)\s*(?=【问题】|$)' + DEFAULT_OUTPUT_PATTERN = r'【问题】(.*?)【回答】(.*?)(?=【问题】|$)' _accelerator = 'cuda' @@ -62,7 +62,7 @@ def __init__(self, qa_pair_template: Optional[str] = None, output_pattern: Optional[str] = None, enable_vllm: bool = False, - llm_params: Optional[Dict] = None, + model_params: Optional[Dict] = None, sampling_params: Optional[Dict] = None, **kwargs): """ @@ -88,13 +88,12 @@ def __init__(self, :param output_pattern: Regular expression pattern to extract questions and answers from model response. :param enable_vllm: Whether to use vllm for inference acceleration. - :param llm_params: Parameters for initializing the model. + :param model_params: Parameters for initializing the model. :param sampling_params: Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} :param kwargs: Extra keyword arguments. """ super().__init__(**kwargs) - self.num_proc = 1 if not seed_file: raise ValueError( @@ -114,24 +113,29 @@ def __init__(self, self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN self.enable_vllm = enable_vllm - llm_params = llm_params or {} + model_params = model_params or {} sampling_params = sampling_params or {} if enable_vllm: assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if llm_params.get('tensor_parallel_size', 1) > 1: - self.num_proc = 1 + # cannot initialize vllm replicas on different GPUs + self.num_proc = 1 + if model_params.get('tensor_parallel_size') is None: + tensor_parallel_size = torch.cuda.device_count() + logger.info(f'Set tensor_parallel_size to \ + {tensor_parallel_size} for vllm.') + model_params['tensor_parallel_size'] = tensor_parallel_size self.model_key = prepare_model( model_type='vllm', pretrained_model_name_or_path=hf_model, - **llm_params) + **model_params) self.sampling_params = vllm.SamplingParams(**sampling_params) else: self.model_key = prepare_model( model_type='huggingface', pretrained_model_name_or_path=hf_model, return_pipe=True, - **llm_params) + **model_params) self.sampling_params = sampling_params self.seed_qa_samples = self._load_seed_qa_samples() @@ -198,17 +202,15 @@ def format_qa_pairs(qa_example): def parse_output(self, raw_output): logger.debug(raw_output) - matches = re.findall(self.output_pattern, raw_output, re.DOTALL) output_qa_pairs = [] + matches = re.findall(self.output_pattern, raw_output, re.DOTALL) for match in matches: question, answer = match - question = question.strip() - answer = answer.strip() - output_qa_pairs.append((question, answer)) + output_qa_pairs.append((question.strip(), answer.strip())) return output_qa_pairs def process_single(self, sample=None, rank=None): - model, processor = get_model(self.model_key, rank, self.use_cuda()) + model, _ = get_model(self.model_key, rank, self.use_cuda()) random_qa_samples = random.sample(self.seed_qa_samples, self.example_num) diff --git a/data_juicer/ops/mapper/generate_qa_from_text_mapper.py b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py index 
991807af26..248dba4280 100644 --- a/data_juicer/ops/mapper/generate_qa_from_text_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py @@ -39,7 +39,7 @@ def __init__(self, *, output_pattern: Optional[str] = None, enable_vllm: bool = False, - llm_params: Optional[Dict] = None, + model_params: Optional[Dict] = None, sampling_params: Optional[Dict] = None, **kwargs): """ @@ -49,7 +49,7 @@ def __init__(self, :param output_pattern: Regular expression pattern to extract questions and answers from model response. :param enable_vllm: Whether to use vllm for inference acceleration. - :param llm_params: Parameters for initializing the model. + :param model_params: Parameters for initializing the model. :param sampling_params: Sampling parameters for text generation, e.g {'temperature': 0.9, 'top_p': 0.95} :param kwargs: Extra keyword arguments. @@ -71,46 +71,47 @@ def __init__(self, super().__init__(**kwargs) if output_pattern is None: - self.output_pattern = r'Human: (.*?)\nAssistant: (.*?)(?=\nHuman|$)' # noqa: E501 + self.output_pattern = r'Human:(.*?)Assistant:(.*?)(?=Human|$)' # noqa: E501 else: self.output_pattern = output_pattern self.enable_vllm = enable_vllm - llm_params = llm_params or {} + model_params = model_params or {} sampling_params = sampling_params or {} if enable_vllm: assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if llm_params.get('tensor_parallel_size', 1) > 1: - self.num_proc = 1 + # cannot initialize vllm replicas on different GPUs + self.num_proc = 1 + if model_params.get('tensor_parallel_size') is None: + tensor_parallel_size = torch.cuda.device_count() + logger.info(f'Set tensor_parallel_size to \ + {tensor_parallel_size} for vllm.') + model_params['tensor_parallel_size'] = tensor_parallel_size self.model_key = prepare_model( model_type='vllm', pretrained_model_name_or_path=hf_model, - **llm_params) + **model_params) self.sampling_params = vllm.SamplingParams(**sampling_params) else: self.model_key = prepare_model( model_type='huggingface', pretrained_model_name_or_path=hf_model, return_pipe=True, - **llm_params) + **model_params) self.sampling_params = sampling_params def parse_output(self, raw_output): - """Extract qestion and answer pair from model output response.""" + logger.debug(raw_output) qa_list = [] - - pat = re.compile(self.output_pattern, re.DOTALL) - qa_pairs = pat.findall(raw_output) - - for qa in qa_pairs: - user, assistant = qa + matches = re.findall(self.output_pattern, raw_output, re.DOTALL) + for match in matches: + user, assistant = match qa_list.append((user.strip(), assistant.strip())) - return qa_list def process_batched(self, samples, rank=None): - model, processor = get_model(self.model_key, rank, self.use_cuda()) + model, _ = get_model(self.model_key, rank, self.use_cuda()) input_keys = samples.keys() num_samples = len(samples[next(iter(input_keys))]) diff --git a/data_juicer/ops/mapper/optimize_qa_mapper.py b/data_juicer/ops/mapper/optimize_qa_mapper.py index 8418573d20..b16288db22 100644 --- a/data_juicer/ops/mapper/optimize_qa_mapper.py +++ b/data_juicer/ops/mapper/optimize_qa_mapper.py @@ -1,6 +1,8 @@ import re from typing import Dict, Optional +from loguru import logger + from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -20,15 +22,15 @@ class OptimizeQAMapper(Mapper): """ # avoid leading whitespace - DEFAULT_SYSTEM_PROMPT = ('请优化输入的问答对,使【问题】和【回答】都更加详细、准确。\n' - 
                             '按照以下格式输出:\n'
+    DEFAULT_SYSTEM_PROMPT = ('请优化输入的问答对,使【问题】和【回答】都更加详细、准确。'
+                             '必须按照以下标记格式,直接输出优化后的问答对:\n'
                              '【问题】\n'
                              '优化后的问题\n'
                              '【回答】\n'
                              '优化后的回答')
-    DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{qa_pair}'
+    DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{}'
     DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
-    DEFAULT_OUTPUT_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*)'
+    DEFAULT_OUTPUT_PATTERN = r'.*?【问题】\s*(.*?)\s*【回答】\s*(.*)'
 
     _accelerator = 'cuda'
 
@@ -40,7 +42,7 @@ def __init__(self,
                  qa_pair_template: Optional[str] = None,
                  output_pattern: Optional[str] = None,
                  enable_vllm: bool = False,
-                 llm_params: Optional[Dict] = None,
+                 model_params: Optional[Dict] = None,
                  sampling_params: Optional[Dict] = None,
                  **kwargs):
         """
@@ -49,7 +51,7 @@ def __init__(self,
         :param hf_model: Hugging Face model ID.
         :param system_prompt: System prompt for guiding the optimization task.
         :param input_template: Template for building the input for the model.
-            Please make sure the template contains "{qa_pair}", which
+            Please make sure the template contains one placeholder "{}", which
             corresponds to the question and answer pair generated by param
             `qa_pair_template`.
         :param qa_pair_template: Template for formatting the question and
@@ -58,7 +60,7 @@ def __init__(self,
         :param output_pattern: Regular expression pattern to extract question
             and answer from model response.
         :param enable_vllm: Whether to use VLLM for inference acceleration.
-        :param llm_params: Parameters for initializing the model.
+        :param model_params: Parameters for initializing the model.
         :param sampling_params: Sampling parameters for text generation
             (e.g., {'temperature': 0.9, 'top_p': 0.95}).
         :param kwargs: Extra keyword arguments.
@@ -72,41 +74,47 @@ def __init__(self,
         self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN
 
         self.enable_vllm = enable_vllm
-        llm_params = llm_params or {}
+        model_params = model_params or {}
         sampling_params = sampling_params or {}
 
         if enable_vllm:
             assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
-            if llm_params.get('tensor_parallel_size', 1) > 1:
-                self.num_proc = 1
+            # cannot initialize vllm replicas on different GPUs
+            self.num_proc = 1
+            if model_params.get('tensor_parallel_size') is None:
+                tensor_parallel_size = torch.cuda.device_count()
+                logger.info(f'Set tensor_parallel_size to \
+                    {tensor_parallel_size} for vllm.')
+                model_params['tensor_parallel_size'] = tensor_parallel_size
             self.model_key = prepare_model(
                 model_type='vllm',
                 pretrained_model_name_or_path=hf_model,
-                **llm_params)
+                **model_params)
             self.sampling_params = vllm.SamplingParams(**sampling_params)
         else:
             self.model_key = prepare_model(
                 model_type='huggingface',
                 pretrained_model_name_or_path=hf_model,
-                return_pipe=False,
-                **llm_params)
+                return_pipe=True,
+                **model_params)
             self.sampling_params = sampling_params
 
     def build_input(self, sample):
         qa_pair = self.qa_pair_template.format(sample[self.query_key],
                                                sample[self.response_key])
-        input_prompt = self.input_template.format(qa_pair=qa_pair)
+        input_prompt = self.input_template.format(qa_pair)
         return input_prompt
 
     def parse_output(self, raw_output):
-        match = re.match(self.output_pattern, raw_output)
+        logger.debug(raw_output)
+        match = re.match(self.output_pattern, raw_output, re.DOTALL)
         if match:
             return match.group(1).strip(), match.group(2).strip()
         else:
             return None, None
 
     def process_single(self, sample=None, rank=None):
-        model, processor = get_model(self.model_key, rank, self.use_cuda())
+        model, _ = get_model(self.model_key, rank, self.use_cuda())
input_prompt = self.build_input(sample) messages = [{ diff --git a/data_juicer/ops/mapper/optimize_query_mapper.py b/data_juicer/ops/mapper/optimize_query_mapper.py index 6ddbe71c0c..dd227b4c1d 100644 --- a/data_juicer/ops/mapper/optimize_query_mapper.py +++ b/data_juicer/ops/mapper/optimize_query_mapper.py @@ -12,7 +12,7 @@ class OptimizeQueryMapper(OptimizeQAMapper): Mapper to optimize query in question-answer pairs. """ - DEFAULT_SYSTEM_PROMPT = '优化问答对中的问题,将其更加详细具体,但仍可以由原答案回答。只输出优化后的问题,不要输出多余内容。' + DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。' # noqa: E501 _accelerator = 'cuda' diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index 7c6d59ba19..f70e204559 100644 --- a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -166,7 +166,7 @@ def prepare_diffusion_model(pretrained_model_name_or_path, return model -def prepare_fasttext_model(model_name='lid.176.bin'): +def prepare_fasttext_model(model_name='lid.176.bin', **model_params): """ Prepare and load a fasttext model. @@ -186,26 +186,29 @@ def prepare_huggingface_model(pretrained_model_name_or_path, return_model=True, return_pipe=False, pipe_task='text-generation', - **llm_params): + **model_params): """ Prepare and load a HuggingFace model with the correspoding processor. :param pretrained_model_name_or_path: model name or path :param return_model: return model or not :param return_pipe: whether to wrap model into pipeline - :param llm_params: LLM initialization parameters. + :param model_params: model initialization parameters. :return: a tuple of (model, input processor) if `return_model` is True; otherwise, only the processor is returned. """ # require torch for transformer model AUTOINSTALL.check(['torch']) + if 'device' in model_params: + model_params['device_map'] = model_params.pop('device') + processor = transformers.AutoProcessor.from_pretrained( - pretrained_model_name_or_path, **llm_params) + pretrained_model_name_or_path, **model_params) if return_model: config = transformers.AutoConfig.from_pretrained( - pretrained_model_name_or_path, **llm_params) + pretrained_model_name_or_path, **model_params) if hasattr(config, 'auto_map'): class_name = next( (k for k in config.auto_map if k.startswith('AutoModel')), @@ -216,26 +219,25 @@ def prepare_huggingface_model(pretrained_model_name_or_path, model_class = getattr(transformers, class_name) model = model_class.from_pretrained(pretrained_model_name_or_path, - **llm_params) + **model_params) if return_pipe: if isinstance(processor, transformers.PreTrainedTokenizerBase): - pipe_param = {'tokenizer': processor} + pipe_params = {'tokenizer': processor} elif isinstance(processor, transformers.SequenceFeatureExtractor): - pipe_param = {'feature_extractor': processor} + pipe_params = {'feature_extractor': processor} elif isinstance(processor, transformers.BaseImageProcessor): - pipe_param = {'image_processor': processor} + pipe_params = {'image_processor': processor} pipe = transformers.pipeline(task=pipe_task, model=model, config=config, - device='cpu', - **pipe_param) + **pipe_params) model = pipe return (model, processor) if return_model else processor -def prepare_kenlm_model(lang, name_pattern='{}.arpa.bin'): +def prepare_kenlm_model(lang, name_pattern='{}.arpa.bin', **model_params): """ Prepare and load a kenlm model. @@ -243,17 +245,20 @@ def prepare_kenlm_model(lang, name_pattern='{}.arpa.bin'): :param lang: language to render model name :return: model instance. 
""" + model_params.pop('device') + model_name = name_pattern.format(lang) logger.info('Loading kenlm language model...') try: - kenlm_model = kenlm.Model(check_model(model_name)) + kenlm_model = kenlm.Model(check_model(model_name), **model_params) except: # noqa: E722 - kenlm_model = kenlm.Model(check_model(model_name, force=True)) + kenlm_model = kenlm.Model(check_model(model_name, force=True), + **model_params) return kenlm_model -def prepare_nltk_model(lang, name_pattern='punkt.{}.pickle'): +def prepare_nltk_model(lang, name_pattern='punkt.{}.pickle', **model_params): """ Prepare and load a nltk punkt model. @@ -261,6 +266,8 @@ def prepare_nltk_model(lang, name_pattern='punkt.{}.pickle'): :param lang: language to render model name :return: model instance. """ + model_params.pop('device') + nltk_to_punkt = { 'en': 'english', 'fr': 'french', @@ -274,20 +281,22 @@ def prepare_nltk_model(lang, name_pattern='punkt.{}.pickle'): logger.info('Loading nltk punkt split model...') try: - nltk_model = nltk.data.load(check_model(model_name)) + nltk_model = nltk.data.load(check_model(model_name), **model_params) except: # noqa: E722 - nltk_model = nltk.data.load(check_model(model_name, force=True)) + nltk_model = nltk.data.load(check_model(model_name, force=True), + **model_params) return nltk_model -def prepare_opencv_classifier(model_path): +def prepare_opencv_classifier(model_path, **model_params): model = cv2.CascadeClassifier(model_path) return model def prepare_recognizeAnything_model( pretrained_model_name_or_path='ram_plus_swin_large_14m.pth', - input_size=384): + input_size=384, + **model_params): """ Prepare and load recognizeAnything model. @@ -295,6 +304,7 @@ def prepare_recognizeAnything_model( :param input_size: the input size of the model. """ logger.info('Loading recognizeAnything model...') + try: model = ram.ram_plus( pretrained=check_model(pretrained_model_name_or_path), @@ -306,11 +316,12 @@ def prepare_recognizeAnything_model( pretrained_model_name_or_path, force=True), image_size=input_size, vit='swin_l') - model.eval() + device = model_params.pop('device') or 'cpu' + model.to(device).eval() return model -def prepare_sentencepiece_model(model_path): +def prepare_sentencepiece_model(model_path, **model_params): """ Prepare and load a sentencepiece model. @@ -326,7 +337,9 @@ def prepare_sentencepiece_model(model_path): return sentencepiece_model -def prepare_sentencepiece_for_lang(lang, name_pattern='{}.sp.model'): +def prepare_sentencepiece_for_lang(lang, + name_pattern='{}.sp.model', + **model_params): """ Prepare and load a sentencepiece model for specific langauge. @@ -340,8 +353,9 @@ def prepare_sentencepiece_for_lang(lang, name_pattern='{}.sp.model'): def prepare_simple_aesthetics_model(pretrained_model_name_or_path, + *, return_model=True, - trust_remote_code=False): + **model_params): """ Prepare and load a simple aesthetics model. @@ -350,32 +364,34 @@ def prepare_simple_aesthetics_model(pretrained_model_name_or_path, :return: a tuple (model, input processor) if `return_model` is True; otherwise, only the processor is returned. 
""" + if 'device' in model_params: + model_params['device_map'] = model_params.pop('device') + processor = transformers.CLIPProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code) + pretrained_model_name_or_path, **model_params) if not return_model: return processor else: if 'v1' in pretrained_model_name_or_path: model = aes_pre.AestheticsPredictorV1.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code) + pretrained_model_name_or_path, **model_params) elif ('v2' in pretrained_model_name_or_path and 'linear' in pretrained_model_name_or_path): model = aes_pre.AestheticsPredictorV2Linear.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code) + pretrained_model_name_or_path, **model_params) elif ('v2' in pretrained_model_name_or_path and 'relu' in pretrained_model_name_or_path): model = aes_pre.AestheticsPredictorV2ReLU.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code) + pretrained_model_name_or_path, **model_params) else: raise ValueError( 'Not support {}'.format(pretrained_model_name_or_path)) return (model, processor) -def prepare_spacy_model(lang, name_pattern='{}_core_web_md-3.7.0'): +def prepare_spacy_model(lang, + name_pattern='{}_core_web_md-3.7.0', + **model_params): """ Prepare spacy model for specific language. @@ -434,8 +450,9 @@ def decompress_model(compressed_model_path): def prepare_video_blip_model(pretrained_model_name_or_path, + *, return_model=True, - trust_remote_code=False): + **model_params): """ Prepare and load a video-clip model with the correspoding processor. @@ -445,6 +462,8 @@ def prepare_video_blip_model(pretrained_model_name_or_path, :return: a tuple (model, input processor) if `return_model` is True; otherwise, only the processor is returned. """ + if 'device' in model_params: + model_params['device_map'] = model_params.pop('device') class VideoBlipVisionModel(transformers.Blip2VisionModel): """A simple, augmented version of Blip2VisionModel to handle @@ -564,25 +583,28 @@ def __init__(self, config: transformers.Blip2Config) -> None: self.post_init() processor = transformers.AutoProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code) + pretrained_model_name_or_path, **model_params) if return_model: model_class = VideoBlipForConditionalGeneration - model = model_class.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code) + model = model_class.from_pretrained(pretrained_model_name_or_path, + **model_params) return (model, processor) if return_model else processor -def prepare_vllm_model(pretrained_model_name_or_path, **llm_params): +def prepare_vllm_model(pretrained_model_name_or_path, **model_params): """ Prepare and load a HuggingFace model with the correspoding processor. :param pretrained_model_name_or_path: model name or path - :param llm_params: LLM initialization parameters. + :param model_params: LLM initialization parameters. 
:return: a tuple of (model, tokenizer) """ os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - model = vllm.LLM(model=pretrained_model_name_or_path, **llm_params) + if model_params.get('device', '').startswith('cuda:'): + model_params['device'] = 'cuda' + + model = vllm.LLM(model=pretrained_model_name_or_path, **model_params) tokenizer = model.get_tokenizer() return (model, tokenizer) @@ -603,6 +625,10 @@ def prepare_vllm_model(pretrained_model_name_or_path, **llm_params): 'vllm': prepare_vllm_model, } +_MODELS_WITHOUT_FILE_LOCK = { + 'kenlm', 'nltk', 'recognizeAnything', 'sentencepiece', 'spacy' +} + def prepare_model(model_type, **model_kwargs): assert (model_type in MODEL_FUNCTION_MAPPING.keys() @@ -610,31 +636,12 @@ def prepare_model(model_type, **model_kwargs): list(MODEL_FUNCTION_MAPPING.keys())) model_func = MODEL_FUNCTION_MAPPING[model_type] model_key = partial(model_func, **model_kwargs) - if model_type != 'vllm': - # instantiate once for possible caching + if model_type in _MODELS_WITHOUT_FILE_LOCK: + # initialize once in the main process to safely download model files model_key() return model_key -def move_to_cuda(objs, rank): - # Assuming model can be either a single module or a tuple of modules - if not isinstance(objs, tuple): - objs = (objs, ) - - for idx, obj in enumerate(objs): - if isinstance(obj, transformers.Pipeline): - obj = obj.model - if callable(getattr(obj, 'to', None)): - logger.debug( - f'Moving {obj.__class__.__name__} to CUDA device {rank}') - obj.to(f'cuda:{rank}') - if hasattr(obj, 'device'): - try: - objs[idx].device = obj.device - except: # noqa: E722 - pass - - def get_model(model_key=None, rank=None, use_cuda=False): if model_key is None: return None @@ -644,11 +651,13 @@ def get_model(model_key=None, rank=None, use_cuda=False): logger.debug( f'{model_key} not found in MODEL_ZOO ({mp.current_process().name})' ) - MODEL_ZOO[model_key] = model_key() - if use_cuda: - rank = 0 if rank is None else rank - rank = rank % cuda_device_count() - move_to_cuda(MODEL_ZOO[model_key], rank) + if use_cuda: + rank = rank if rank is not None else 0 + rank = rank % cuda_device_count() + device = f'cuda:{rank}' + else: + device = 'cpu' + MODEL_ZOO[model_key] = model_key(device=device) return MODEL_ZOO[model_key] diff --git a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py index c46b410af4..60348af612 100644 --- a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py +++ b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py @@ -14,24 +14,20 @@ class GenerateQAFromExamplesMapperTest(DataJuicerTestCaseBase): text_key = 'text' - def _run_op(self, - enable_vllm=False, - llm_params=None, - sampling_params=None): + def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1): op = GenerateQAFromExamplesMapper( seed_file='demos/data/demo-dataset-chatml.jsonl', example_num=3, enable_vllm=enable_vllm, - llm_params=llm_params, sampling_params=sampling_params, ) from data_juicer.format.empty_formatter import EmptyFormatter dataset = EmptyFormatter(3, [self.text_key]).load_dataset() - dataset = dataset.map(op.process) + results = dataset.map(op.process, num_proc=num_proc, with_rank=True) - for row in dataset: + for row in results: logger.info(row) self.assertIn(op.query_key, row) self.assertIn(op.response_key, row) @@ -40,13 +36,13 @@ def test(self): sampling_params = {'max_new_tokens': 200} self._run_op(sampling_params=sampling_params) + def test_multi_process(self): + sampling_params = 
{'max_new_tokens': 200} + self._run_op(sampling_params=sampling_params, num_proc=3) + def test_vllm(self): - import torch - llm_params = {'tensor_parallel_size': torch.cuda.device_count()} sampling_params = {'max_tokens': 200} - self._run_op(enable_vllm=True, - llm_params=llm_params, - sampling_params=sampling_params) + self._run_op(enable_vllm=True, sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_generate_qa_from_text_mapper.py b/tests/ops/mapper/test_generate_qa_from_text_mapper.py index d1a26e61c0..14834f5d1b 100644 --- a/tests/ops/mapper/test_generate_qa_from_text_mapper.py +++ b/tests/ops/mapper/test_generate_qa_from_text_mapper.py @@ -17,21 +17,26 @@ class GenerateQAFromTextMapperTest(DataJuicerTestCaseBase): def _run_op(self, enable_vllm=False, - llm_params=None, - sampling_params=None): + model_params=None, + sampling_params=None, + num_proc=1): + op = GenerateQAFromTextMapper(enable_vllm=enable_vllm, - llm_params=llm_params, + model_params=model_params, sampling_params=sampling_params) samples = [{ self.text_key: '蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n' + }, { + self.text_key: + '四大名著是指《水浒传》《三国演义》《西游记》《红楼梦》四部长篇小说,作者分别是施耐庵、罗贯中、吴承恩、曹雪芹。' }] dataset = Dataset.from_list(samples) - dataset = dataset.map(op.process) + results = dataset.map(op.process, num_proc=num_proc, with_rank=True) - for row in dataset: + for row in results: logger.info(row) self.assertIn(op.query_key, row) self.assertIn(op.response_key, row) @@ -40,20 +45,19 @@ def test(self): sampling_params = {'max_new_tokens': 200} self._run_op(sampling_params=sampling_params) + def test_multi_process(self): + sampling_params = {'max_new_tokens': 200} + self._run_op(sampling_params=sampling_params, num_proc=2) + def test_vllm(self): - import torch - llm_params = { - 'tensor_parallel_size': torch.cuda.device_count(), - 'max_model_len': 1024, - 'max_num_seqs': 16 - } + model_params = {'max_model_len': 1024, 'max_num_seqs': 16} sampling_params = { 'temperature': 0.9, 'top_p': 0.95, 'max_tokens': 200 } self._run_op(enable_vllm=True, - llm_params=llm_params, + model_params=model_params, sampling_params=sampling_params) diff --git a/tests/ops/mapper/test_optimize_qa_mapper.py b/tests/ops/mapper/test_optimize_qa_mapper.py index 1b275fc837..0fd21b5867 100644 --- a/tests/ops/mapper/test_optimize_qa_mapper.py +++ b/tests/ops/mapper/test_optimize_qa_mapper.py @@ -2,6 +2,7 @@ from loguru import logger +from data_juicer.core.data import NestedDataset as Dataset from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) @@ -11,42 +12,40 @@ # These tests have been tested locally. 
@SKIPPED_TESTS.register_module() class OptimizeQAMapperTest(DataJuicerTestCaseBase): - query_key = 'query' - - def _run_op(self, - enable_vllm=False, - llm_params=None, - sampling_params=None): - op = OptimizeQAMapper( - enable_vllm=enable_vllm, - llm_params=llm_params, - sampling_params=sampling_params, - ) + + def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1): + + op = OptimizeQAMapper(enable_vllm=enable_vllm, + sampling_params=sampling_params) samples = [{ 'query': '鱼香肉丝怎么做?', 'response': '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' + }, { + 'query': '什么是蚂蚁上树?', + 'response': '蚂蚁上树是一道中国菜。' }] + dataset = Dataset.from_list(samples) + results = dataset.map(op.process, num_proc=num_proc, with_rank=True) - for sample in samples: - result = op.process(sample) - logger.info(f'Output results: {result}') - self.assertNotEqual(result['query'], '') - self.assertNotEqual(result['response'], '') + for row in results: + logger.info(f'Output results: {row}') + self.assertNotEqual(row['query'], '') + self.assertNotEqual(row['response'], '') def test(self): sampling_params = {'max_new_tokens': 200} self._run_op(sampling_params=sampling_params) + def test_multi_process(self): + sampling_params = {'max_new_tokens': 200} + self._run_op(sampling_params=sampling_params, num_proc=2) + def test_vllm(self): - import torch - llm_params = {'tensor_parallel_size': torch.cuda.device_count()} sampling_params = {'max_tokens': 200} - self._run_op(enable_vllm=True, - llm_params=llm_params, - sampling_params=sampling_params) + self._run_op(enable_vllm=True, sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_optimize_query_mapper.py b/tests/ops/mapper/test_optimize_query_mapper.py index 3162d6d19d..8d6069954f 100644 --- a/tests/ops/mapper/test_optimize_query_mapper.py +++ b/tests/ops/mapper/test_optimize_query_mapper.py @@ -2,6 +2,7 @@ from loguru import logger +from data_juicer.core.data import NestedDataset as Dataset from data_juicer.ops.mapper.optimize_query_mapper import OptimizeQueryMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) @@ -12,14 +13,13 @@ @SKIPPED_TESTS.register_module() class OptimizeQueryMapperTest(DataJuicerTestCaseBase): - def _run_op(self, - enable_vllm=False, - llm_params=None, - sampling_params=None): + def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1): + op = OptimizeQueryMapper( hf_model='alibaba-pai/Qwen2-7B-Instruct-Refine', + input_template='{}', + qa_pair_template='{}', enable_vllm=enable_vllm, - llm_params=llm_params, sampling_params=sampling_params) samples = [{ @@ -27,24 +27,28 @@ def _run_op(self, '鱼香肉丝怎么做?', 'response': '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' + }, { + 'query': '什么是蚂蚁上树?', + 'response': '蚂蚁上树是一道中国菜。' }] + dataset = Dataset.from_list(samples) + results = dataset.map(op.process, num_proc=num_proc, with_rank=True) - for sample in samples: - result = op.process(sample) - logger.info(f'Output results: {result}') - self.assertNotEqual(result['query'], '') + for row in results: + logger.info(f'Output results: {row}') + self.assertNotEqual(row['query'], '') def test(self): sampling_params = {'max_new_tokens': 200} self._run_op(sampling_params=sampling_params) + def test_multi_process(self): + sampling_params = {'max_new_tokens': 200} + self._run_op(sampling_params=sampling_params, num_proc=2) + def test_vllm(self): - import torch - llm_params = {'tensor_parallel_size': torch.cuda.device_count()} sampling_params = {'max_tokens': 
200} - self._run_op(enable_vllm=True, - llm_params=llm_params, - sampling_params=sampling_params) + self._run_op(enable_vllm=True, sampling_params=sampling_params) if __name__ == '__main__': diff --git a/tests/ops/mapper/test_optimize_response_mapper.py b/tests/ops/mapper/test_optimize_response_mapper.py index 6922653ca6..6accc8f296 100644 --- a/tests/ops/mapper/test_optimize_response_mapper.py +++ b/tests/ops/mapper/test_optimize_response_mapper.py @@ -2,6 +2,7 @@ from loguru import logger +from data_juicer.core.data import NestedDataset as Dataset from data_juicer.ops.mapper.optimize_response_mapper import \ OptimizeResponseMapper from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, @@ -13,12 +14,9 @@ @SKIPPED_TESTS.register_module() class OptimizeResponseMapperTest(DataJuicerTestCaseBase): - def _run_op(self, - enable_vllm=False, - llm_params=None, - sampling_params=None): + def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1): + op = OptimizeResponseMapper(enable_vllm=enable_vllm, - llm_params=llm_params, sampling_params=sampling_params) samples = [{ @@ -26,24 +24,28 @@ def _run_op(self, '鱼香肉丝怎么做?', 'response': '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' + }, { + 'query': '什么是蚂蚁上树?', + 'response': '蚂蚁上树是一道中国菜。' }] + dataset = Dataset.from_list(samples) + results = dataset.map(op.process, num_proc=num_proc, with_rank=True) - for sample in samples: - result = op.process(sample) - logger.info(f'Output results: {result}') - self.assertNotEqual(result['response'], '') + for row in results: + logger.info(f'Output results: {row}') + self.assertNotEqual(row['response'], '') def test(self): sampling_params = {'max_new_tokens': 200} self._run_op(sampling_params=sampling_params) + def test_multi_process(self): + sampling_params = {'max_new_tokens': 200} + self._run_op(sampling_params=sampling_params, num_proc=2) + def test_vllm(self): - import torch - llm_params = {'tensor_parallel_size': torch.cuda.device_count()} sampling_params = {'max_tokens': 200} - self._run_op(enable_vllm=True, - llm_params=llm_params, - sampling_params=sampling_params) + self._run_op(enable_vllm=True, sampling_params=sampling_params) if __name__ == '__main__': From 222512c2bc91dcb731a27359cabbdbe45de41cca Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Thu, 31 Oct 2024 10:41:33 +0000 Subject: [PATCH 18/23] fix empty history schema --- data_juicer/ops/base_op.py | 4 ++++ data_juicer/ops/mapper/generate_qa_from_examples_mapper.py | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 918831504a..0bb2d15d38 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -2,6 +2,7 @@ import traceback from functools import wraps +import numpy as np import pyarrow as pa from loguru import logger @@ -215,6 +216,9 @@ def run(self, dataset): dataset = NestedDataset(dataset) return dataset + def empty_history(self): + return np.empty((0, 0), dtype=str) + class Mapper(OP): diff --git a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py index ceaf33d9d9..dd2492f4b8 100644 --- a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py @@ -241,7 +241,7 @@ def process_single(self, sample=None, rank=None): sample.update({ self.query_key: '', self.response_key: '', - self.history_key: [] + self.history_key: self.empty_history() }) return sample @@ -255,9 +255,11 @@ def 
process_single(self, sample=None, rank=None): if sim_score <= self.similarity_threshold: query, response = output_qa_pairs[-1] history = output_qa_pairs[:-1] + if len(history) == 0: + history = self.empty_history() else: query = response = '' - history = [] + history = self.empty_history() logger.info('Filter this generated sample due to similarity.') sample.update({ From ba4a7889c2b6bd43f121f1e505dde29a4ca7226f Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Thu, 31 Oct 2024 10:56:36 +0000 Subject: [PATCH 19/23] fix device --- data_juicer/utils/model_utils.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index f70e204559..6495a3b2fc 100644 --- a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -82,20 +82,19 @@ def check_model(model_name, force=False): if os.path.exists(cached_model_path): os.remove(cached_model_path) logger.info( - f'Model [{cached_model_path}] invalid, force to downloading...' - ) + f'Model [{cached_model_path}] is invalid. Forcing download...') else: logger.info( - f'Model [{cached_model_path}] not found. Downloading...') + f'Model [{cached_model_path}] is not found. Downloading...') try: model_link = os.path.join(MODEL_LINKS, model_name) - wget.download(model_link, cached_model_path, bar=None) + wget.download(model_link, cached_model_path) except: # noqa: E722 try: backup_model_link = os.path.join( get_backup_model_link(model_name), model_name) - wget.download(backup_model_link, cached_model_path, bar=None) + wget.download(backup_model_link, cached_model_path) except: # noqa: E722 logger.error( f'Downloading model [{model_name}] error. ' @@ -245,7 +244,7 @@ def prepare_kenlm_model(lang, name_pattern='{}.arpa.bin', **model_params): :param lang: language to render model name :return: model instance. """ - model_params.pop('device') + model_params.pop('device', None) model_name = name_pattern.format(lang) @@ -266,7 +265,7 @@ def prepare_nltk_model(lang, name_pattern='punkt.{}.pickle', **model_params): :param lang: language to render model name :return: model instance. 
""" - model_params.pop('device') + model_params.pop('device', None) nltk_to_punkt = { 'en': 'english', @@ -316,7 +315,7 @@ def prepare_recognizeAnything_model( pretrained_model_name_or_path, force=True), image_size=input_size, vit='swin_l') - device = model_params.pop('device') or 'cpu' + device = model_params.pop('device', 'cpu') model.to(device).eval() return model From 47f6b8e05eb810844dbd19bfafe7ab1cafe0de07 Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Thu, 31 Oct 2024 11:17:11 +0000 Subject: [PATCH 20/23] ensure `with_rank` is set properly --- data_juicer/core/data.py | 21 +++++++++++++------ tests/ops/mapper/test_image_tagging_mapper.py | 2 +- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/data_juicer/core/data.py b/data_juicer/core/data.py index 3b41f21484..d31285f9ae 100644 --- a/data_juicer/core/data.py +++ b/data_juicer/core/data.py @@ -246,9 +246,10 @@ def map(self, *args, **kargs): if inspect.ismethod(called_func): # batched is required for fault-tolerant or batched OP - if not called_func.__self__.turbo or hasattr( + if callable(getattr( called_func.__self__, - 'is_batched_op') and called_func.__self__.is_batched_op(): + 'is_batched_op')) and called_func.__self__.is_batched_op( + ) or not called_func.__self__.turbo: kargs['batched'] = True kargs['batch_size'] = kargs.pop('batch_size', 1) if hasattr( called_func.__self__, 'is_batched_op' @@ -256,6 +257,12 @@ def map(self, *args, **kargs): else: kargs['batched'] = False + # rank is required for cuda model loading + if callable( + getattr(called_func.__self__, + 'use_cuda')) and called_func.__self__.use_cuda(): + kargs['with_rank'] = True + if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None: new_fingerprint = generate_fingerprint(self, *args, **kargs) kargs['new_fingerprint'] = new_fingerprint @@ -300,10 +307,12 @@ def filter(self, *args, **kargs): called_func = called_func.__wrapped__ # Batched is always required for fault tolerance - if inspect.ismethod( - called_func) and called_func.__self__.is_batched_op(): - kargs['batched'] = True - kargs['batch_size'] = kargs.pop('batch_size', 1) + if inspect.ismethod(called_func): + if callable(getattr( + called_func.__self__, + 'is_batched_op')) and called_func.__self__.is_batched_op(): + kargs['batched'] = True + kargs['batch_size'] = kargs.pop('batch_size', 1) if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None: new_fingerprint = generate_fingerprint(self, *args, **kargs) diff --git a/tests/ops/mapper/test_image_tagging_mapper.py b/tests/ops/mapper/test_image_tagging_mapper.py index e9609b12f6..ab9fd12b11 100644 --- a/tests/ops/mapper/test_image_tagging_mapper.py +++ b/tests/ops/mapper/test_image_tagging_mapper.py @@ -22,7 +22,7 @@ def _run_image_tagging_mapper(self, target_list, num_proc=1): dataset = Dataset.from_list(source_list) - dataset = dataset.map(op.process, num_proc=num_proc) + dataset = dataset.map(op.process, num_proc=num_proc, with_rank=True) res_list = dataset.to_list() self.assertEqual(res_list, target_list) From da8b25444bb3eb13c78faea10230a167b4d252eb Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Fri, 1 Nov 2024 03:11:18 +0000 Subject: [PATCH 21/23] fix diffusion model_params --- data_juicer/utils/model_utils.py | 40 ++++++-------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index 6495a3b2fc..5528914385 100644 --- a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -8,7 +8,7 @@ import 
wget from loguru import logger -from data_juicer import cuda_device_count, is_cuda_available +from data_juicer import cuda_device_count from data_juicer.utils.lazy_loader import AUTOINSTALL, LazyLoader from .cache_utils import DATA_JUICER_MODELS_CACHE as DJMC @@ -104,11 +104,8 @@ def check_model(model_name, force=False): return cached_model_path -def prepare_diffusion_model(pretrained_model_name_or_path, - diffusion_type, - torch_dtype='fp32', - revision='main', - trust_remote_code=False): +def prepare_diffusion_model(pretrained_model_name_or_path, diffusion_type, + **model_params): """ Prepare and load an Diffusion model from HuggingFace. @@ -116,15 +113,13 @@ def prepare_diffusion_model(pretrained_model_name_or_path, or local path to the model :param diffusion_type: the use of the diffusion model. It can be 'image2image', 'text2image', 'inpainting' - :param torch_dtype: the floating point to load the diffusion - model. Can be one of ['fp32', 'fp16', 'bf16'] - :param revision: The specific model version to use. It can be a - branch name, a tag name, a commit id, or any identifier allowed - by Git. :return: a Diffusion model. """ AUTOINSTALL.check(['torch', 'transformers']) + if 'device' in model_params: + model_params['device_map'] = model_params.pop('device') + diffusion_type_to_pipeline = { 'image2image': diffusers.AutoPipelineForImage2Image, 'text2image': diffusers.AutoPipelineForText2Image, @@ -137,30 +132,9 @@ def prepare_diffusion_model(pretrained_model_name_or_path, 'model. Can only be one of ' '["image2image", "text2image", "inpainting"].') - if torch_dtype not in ['fp32', 'fp16', 'bf16']: - raise ValueError( - f'Not support {torch_dtype} torch_dtype for diffusion ' - 'model. Can only be one of ' - '["fp32", "fp16", "bf16"].') - - if not is_cuda_available() and (torch_dtype == 'fp16' - or torch_dtype == 'bf16'): - raise ValueError( - 'In cpu mode, only fp32 torch_dtype can be used for diffusion' - ' model.') - pipeline = diffusion_type_to_pipeline[diffusion_type] - if torch_dtype == 'bf16': - torch_dtype = torch.bfloat16 - elif torch_dtype == 'fp16': - torch_dtype = torch.float16 - else: - torch_dtype = torch.float32 - model = pipeline.from_pretrained(pretrained_model_name_or_path, - revision=revision, - torch_dtype=torch_dtype, - trust_remote_code=trust_remote_code) + **model_params) return model From a265398233c4d209a57ac1d363873bd12fae480b Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Fri, 1 Nov 2024 08:30:39 +0000 Subject: [PATCH 22/23] minor fix --- configs/config_all.yaml | 2 +- .../generate_qa_from_examples_mapper.py | 29 ++++++++++--------- data_juicer/ops/mapper/optimize_qa_mapper.py | 4 +-- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/configs/config_all.yaml b/configs/config_all.yaml index aa07eeeea4..5023444745 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -65,7 +65,7 @@ process: - generate_qa_from_examples_mapper: # mapper to generate question and answer pairs from examples. hf_model: 'Qwen/Qwen2.5-7B-Instruct' # Model name on huggingface to generate question and answer pairs. seed_file: 'demos/data/demo-dataset-chatml.jsonl' # Path to the seed file in chatml format. - instruct_num: 3 # The number of randomly selected seed examples. + example_num: 3 # The number of randomly selected seed examples. similarity_threshold: 0.7 # the similarity score threshold between the generated samples and the seed examples. Range from 0 to 1. Samples with similarity score less than this threshold will be kept. 
system_prompt: null # System prompt for guiding the generation task. input_template: null # Template for building the input prompt. diff --git a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py index dd2492f4b8..4d7ff01bdf 100644 --- a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py +++ b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py @@ -43,8 +43,8 @@ class GenerateQAFromExamplesMapper(Mapper): '3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。\n' '4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。\n') - DEFAULT_INPUT_TEMPLATE = '{examples}' - DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{qa_pairs}' + DEFAULT_INPUT_TEMPLATE = '{}' + DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{}' DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' DEFAULT_OUTPUT_PATTERN = r'【问题】(.*?)【回答】(.*?)(?=【问题】|$)' @@ -79,11 +79,13 @@ def __init__(self, this threshold will be kept. :param system_prompt: System prompt for guiding the generation task. :param input_template: Template for building the input prompt. It must - include "{examples}", which will be replaced by `example_num` - formatted examples defined by `example_template`. - :param example_template: Template for formatting each QA example. + include one placeholder '{}', which will be replaced by + `example_num` formatted examples defined by `example_template`. + :param example_template: Template for formatting one QA example. It + must include one placeholder '{}', which will be replaced by one + formatted qa_pair. :param qa_pair_template: Template for formatting a single QA pair - within each example. Must include two placeholders "{}" for the + within each example. Must include two placeholders '{}' for the question and answer. :param output_pattern: Regular expression pattern to extract questions and answers from model response. 
@@ -141,9 +143,6 @@ def __init__(self, self.seed_qa_samples = self._load_seed_qa_samples() if len(self.seed_qa_samples) == 0: raise ValueError('No QA data was parsed from the seed file!') - self.seed_qa_str = [ - self._sample_to_str(sample) for sample in self.seed_qa_samples - ] def _load_seed_qa_samples(self): """Load QA pairs from chatml format file.""" @@ -160,11 +159,13 @@ def _load_seed_qa_samples(self): def _sample_to_str(self, qa_sample): return '\n'.join(['\n'.join(qa_pair) for qa_pair in qa_sample]) + '\n' - def _max_rouge_l_score(self, hypothesis): + def _max_rouge_l_score(self, hypothesis, references): r = rouge.Rouge() max_score = 0.0 - for reference in self.seed_qa_str: - scores = r.get_scores(hypothesis, reference) + hyp_str = self._sample_to_str(hypothesis) + for reference in references: + ref_str = self._sample_to_str(reference) + scores = r.get_scores(hyp_str, ref_str) rouge_l_score = scores[0]['rouge-l']['f'] if rouge_l_score > max_score: max_score = rouge_l_score @@ -246,8 +247,8 @@ def process_single(self, sample=None, rank=None): return sample if self.similarity_type == 'rouge_l': - output_qa_str = self._sample_to_str(output_qa_pairs) - sim_score = self._max_rouge_l_score(output_qa_str) + sim_score = self._max_rouge_l_score(output_qa_pairs, + random_qa_samples) else: raise ValueError( f'Not support similarity type "{self.similarity_type}"!') diff --git a/data_juicer/ops/mapper/optimize_qa_mapper.py b/data_juicer/ops/mapper/optimize_qa_mapper.py index b16288db22..eda705e5fb 100644 --- a/data_juicer/ops/mapper/optimize_qa_mapper.py +++ b/data_juicer/ops/mapper/optimize_qa_mapper.py @@ -51,12 +51,12 @@ def __init__(self, :param hf_model: Hugging Face model ID. :param system_prompt: System prompt for guiding the optimization task. :param input_template: Template for building the input for the model. - Please make sure the template contains one placeholder "{}", which + Please make sure the template contains one placeholder '{}', which corresponds to the question and answer pair generated by param `qa_pair_template`. :param qa_pair_template: Template for formatting the question and answer pair. Please make sure the template contains two - "{}" to format question and answer. + '{}' to format question and answer. :param output_pattern: Regular expression pattern to extract question and answer from model response. :param enable_vllm: Whether to use VLLM for inference acceleration. From c6d514767b47a149349b2882899096100f7b4e5d Mon Sep 17 00:00:00 2001 From: "gece.gc" Date: Mon, 4 Nov 2024 02:20:43 +0000 Subject: [PATCH 23/23] TODO: new OP tests to be checked --- tests/ops/mapper/test_generate_qa_from_examples_mapper.py | 2 +- tests/ops/mapper/test_generate_qa_from_text_mapper.py | 2 +- tests/ops/mapper/test_optimize_qa_mapper.py | 2 +- tests/ops/mapper/test_optimize_query_mapper.py | 2 +- tests/ops/mapper/test_optimize_response_mapper.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py index 60348af612..2df4f09c0e 100644 --- a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py +++ b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py @@ -8,7 +8,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to ? # These tests have been tested locally. 
@SKIPPED_TESTS.register_module() class GenerateQAFromExamplesMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_generate_qa_from_text_mapper.py b/tests/ops/mapper/test_generate_qa_from_text_mapper.py index 14834f5d1b..e67285b18d 100644 --- a/tests/ops/mapper/test_generate_qa_from_text_mapper.py +++ b/tests/ops/mapper/test_generate_qa_from_text_mapper.py @@ -9,7 +9,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to ? # These tests have been tested locally. @SKIPPED_TESTS.register_module() class GenerateQAFromTextMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_optimize_qa_mapper.py b/tests/ops/mapper/test_optimize_qa_mapper.py index 0fd21b5867..c937cbf7e6 100644 --- a/tests/ops/mapper/test_optimize_qa_mapper.py +++ b/tests/ops/mapper/test_optimize_qa_mapper.py @@ -8,7 +8,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to ? # These tests have been tested locally. @SKIPPED_TESTS.register_module() class OptimizeQAMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_optimize_query_mapper.py b/tests/ops/mapper/test_optimize_query_mapper.py index 8d6069954f..79fd582676 100644 --- a/tests/ops/mapper/test_optimize_query_mapper.py +++ b/tests/ops/mapper/test_optimize_query_mapper.py @@ -8,7 +8,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to ? # These tests have been tested locally. @SKIPPED_TESTS.register_module() class OptimizeQueryMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_optimize_response_mapper.py b/tests/ops/mapper/test_optimize_response_mapper.py index 6accc8f296..568622d7c2 100644 --- a/tests/ops/mapper/test_optimize_response_mapper.py +++ b/tests/ops/mapper/test_optimize_response_mapper.py @@ -9,7 +9,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to ? # These tests have been tested locally. @SKIPPED_TESTS.register_module() class OptimizeResponseMapperTest(DataJuicerTestCaseBase):
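Taken together, these patches settle on a single loading path for all ops: the refactored tests drive each op through dataset.map(op.process, num_proc=..., with_rank=True), and every worker resolves its own device from its rank. A rough sketch of the intended flow, simplified from the code above (the model id and rank value are placeholders only):

    from data_juicer.utils.model_utils import get_model, prepare_model

    # prepare_model() only registers a partial; nothing heavy is loaded yet,
    # except for the few lightweight model types that pre-download files in
    # the main process.
    model_key = prepare_model(
        model_type='huggingface',
        pretrained_model_name_or_path='Qwen/Qwen2.5-7B-Instruct',
        return_pipe=True)

    # Inside an op, `rank` is injected by dataset.map(..., with_rank=True).
    # get_model() instantiates the model once per worker process and, when
    # use_cuda is True, places it on cuda:{rank % cuda_device_count()}.
    model, processor = get_model(model_key, rank=0, use_cuda=True)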