Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,16 @@ process:
- clean_links_mapper: # remove web links from text.
- clean_copyright_mapper: # remove copyright comments.
- expand_macro_mapper: # expand macro definitions in Latex text.
- extract_qa_mapper: # mapper to extract question and answer pair from text.
- generate_qa_from_text_mapper: # mapper to generate question and answer pairs from text.
hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa' # model name on huggingface to generate question and answer pairs from text.
pattern: null # regular expression pattern to search for within text.
qa_format: 'chatml' # Output format of question and answer pair.
enable_vllm: true # Whether to use vllm for inference acceleration.
tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
- fix_unicode_mapper: # fix unicode errors in text.
- generate_instruction_mapper: # generate new instruction text data.
- generate_qa_from_examples_mapper: # mapper to generate question and answer pairs from examples.
hf_model: 'Qwen/Qwen-7B-Chat' # model name on huggingface to generate question and answer pairs from examples.
seed_file: 'demos/data/demo-dataset-chatml.jsonl' # seed file in chatml format, used as example samples to generate new question and answer pairs.
instruct_num: 3 # the number of generated samples.
Expand Down Expand Up @@ -146,7 +145,7 @@ process:
delete_random_char: false # whether to open the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
swap_random_char: false # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
replace_equivalent_num: false # whether to open the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
- optimize_instruction_mapper: # optimize instruction.
- optimize_query_mapper: # optimize instruction query.
hf_model: 'alibaba-pai/Qwen2-7B-Instruct-Refine' # model name on huggingface to optimize the query.
enable_vllm: true # whether to use vllm for inference acceleration.
tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
Expand Down
5 changes: 5 additions & 0 deletions data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ def __init__(self, *args, **kwargs):
self.image_key = kwargs.get('image_key', 'images')
self.audio_key = kwargs.get('audio_key', 'audios')
self.video_key = kwargs.get('video_key', 'videos')

self.query_key = kwargs.get('query_key', 'query')
self.response_key = kwargs.get('response_key', 'response')
self.history_key = kwargs.get('history_key', 'history')

self.batch_size = kwargs.get('batch_size', 1000)

# whether the model can be accelerated using cuda
Expand Down
11 changes: 3 additions & 8 deletions data_juicer/ops/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,7 @@
from .special_characters import SPECIAL_CHARACTERS

__all__ = [
'get_sentences_from_document',
'get_words_from_document',
'merge_on_whitespace_tab_newline',
'split_on_newline_tab_whitespace',
'split_on_whitespace',
'strip',
'words_augmentation',
'words_refinement',
'get_sentences_from_document', 'get_words_from_document',
'merge_on_whitespace_tab_newline', 'split_on_newline_tab_whitespace',
'split_on_whitespace', 'strip', 'words_augmentation', 'words_refinement'
]
7 changes: 4 additions & 3 deletions data_juicer/ops/deduplicator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from .video_deduplicator import VideoDeduplicator

__all__ = [
'VideoDeduplicator', 'RayBasicDeduplicator', 'DocumentMinhashDeduplicator',
'RayImageDeduplicator', 'RayDocumentDeduplicator', 'DocumentDeduplicator',
'ImageDeduplicator', 'DocumentSimhashDeduplicator', 'RayVideoDeduplicator'
'DocumentDeduplicator', 'DocumentMinhashDeduplicator',
'DocumentSimhashDeduplicator', 'ImageDeduplicator', 'RayBasicDeduplicator',
'RayDocumentDeduplicator', 'RayImageDeduplicator', 'RayVideoDeduplicator',
'VideoDeduplicator'
]
63 changes: 18 additions & 45 deletions data_juicer/ops/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,50 +65,23 @@
from .word_repetition_filter import WordRepetitionFilter
from .words_num_filter import WordsNumFilter

# yapf: enable

__all__ = [
'ImageTextSimilarityFilter',
'VideoAspectRatioFilter',
'ImageTextMatchingFilter',
'ImageNSFWFilter',
'TokenNumFilter',
'TextLengthFilter',
'SpecifiedNumericFieldFilter',
'AudioNMFSNRFilter',
'VideoAestheticsFilter',
'PerplexityFilter',
'PhraseGroundingRecallFilter',
'MaximumLineLengthFilter',
'AverageLineLengthFilter',
'SpecifiedFieldFilter',
'VideoTaggingFromFramesFilter',
'TextEntityDependencyFilter',
'VideoResolutionFilter',
'AlphanumericFilter',
'ImageWatermarkFilter',
'ImageAestheticsFilter',
'AudioSizeFilter',
'StopWordsFilter',
'CharacterRepetitionFilter',
'ImageShapeFilter',
'VideoDurationFilter',
'TextActionFilter',
'VideoOcrAreaRatioFilter',
'VideoNSFWFilter',
'SpecialCharactersFilter',
'VideoFramesTextSimilarityFilter',
'ImageAspectRatioFilter',
'AudioDurationFilter',
'LanguageIDScoreFilter',
'SuffixFilter',
'ImageSizeFilter',
'VideoWatermarkFilter',
'WordsNumFilter',
'ImageFaceCountFilter',
'ImageFaceRatioFilter',
'FlaggedWordFilter',
'WordRepetitionFilter',
'VideoMotionScoreFilter',
'ImagePairSimilarityFilter'
'AlphanumericFilter', 'AudioDurationFilter', 'AudioNMFSNRFilter',
'AudioSizeFilter', 'AverageLineLengthFilter', 'CharacterRepetitionFilter',
'FlaggedWordFilter', 'ImageAestheticsFilter', 'ImageAspectRatioFilter',
'ImageFaceCountFilter', 'ImageFaceRatioFilter', 'ImageNSFWFilter',
'ImagePairSimilarityFilter', 'ImageShapeFilter', 'ImageSizeFilter',
'ImageTextMatchingFilter', 'ImageTextSimilarityFilter',
'ImageWatermarkFilter', 'LanguageIDScoreFilter', 'MaximumLineLengthFilter',
'PerplexityFilter', 'PhraseGroundingRecallFilter',
'SpecialCharactersFilter', 'SpecifiedFieldFilter',
'SpecifiedNumericFieldFilter', 'StopWordsFilter', 'SuffixFilter',
'TextActionFilter', 'TextEntityDependencyFilter', 'TextLengthFilter',
'TokenNumFilter', 'VideoAestheticsFilter', 'VideoAspectRatioFilter',
'VideoDurationFilter', 'VideoFramesTextSimilarityFilter',
'VideoMotionScoreFilter', 'VideoNSFWFilter', 'VideoOcrAreaRatioFilter',
'VideoResolutionFilter', 'VideoTaggingFromFramesFilter',
'VideoWatermarkFilter', 'WordRepetitionFilter', 'WordsNumFilter'
]

# yapf: enable
87 changes: 32 additions & 55 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
from . import (audio_ffmpeg_wrapped_mapper, chinese_convert_mapper,
clean_copyright_mapper, clean_email_mapper, clean_html_mapper,
clean_ip_mapper, clean_links_mapper, expand_macro_mapper,
extract_qa_mapper, fix_unicode_mapper,
generate_instruction_mapper, image_blur_mapper,
fix_unicode_mapper, generate_qa_from_examples_mapper,
generate_qa_from_text_mapper, image_blur_mapper,
image_captioning_from_gpt4v_mapper, image_captioning_mapper,
image_diffusion_mapper, image_face_blur_mapper,
image_tagging_mapper, nlpaug_en_mapper, nlpcda_zh_mapper,
optimize_instruction_mapper, punctuation_normalization_mapper,
optimize_qa_mapper, optimize_query_mapper,
optimize_response_mapper, punctuation_normalization_mapper,
remove_bibliography_mapper, remove_comments_mapper,
remove_header_mapper, remove_long_words_mapper,
remove_non_chinese_character_mapper,
Expand All @@ -34,9 +35,9 @@
from .clean_ip_mapper import CleanIpMapper
from .clean_links_mapper import CleanLinksMapper
from .expand_macro_mapper import ExpandMacroMapper
from .extract_qa_mapper import ExtractQAMapper
from .fix_unicode_mapper import FixUnicodeMapper
from .generate_instruction_mapper import GenerateInstructionMapper
from .generate_qa_from_examples_mapper import GenerateQAFromExamplesMapper
from .generate_qa_from_text_mapper import GenerateQAFromTextMapper
from .image_blur_mapper import ImageBlurMapper
from .image_captioning_from_gpt4v_mapper import ImageCaptioningFromGPT4VMapper
from .image_captioning_mapper import ImageCaptioningMapper
Expand All @@ -45,7 +46,9 @@
from .image_tagging_mapper import ImageTaggingMapper
from .nlpaug_en_mapper import NlpaugEnMapper
from .nlpcda_zh_mapper import NlpcdaZhMapper
from .optimize_instruction_mapper import OptimizeInstructionMapper
from .optimize_qa_mapper import OptimizeQAMapper
from .optimize_query_mapper import OptimizeQueryMapper
from .optimize_response_mapper import OptimizeResponseMapper
from .punctuation_normalization_mapper import PunctuationNormalizationMapper
from .remove_bibliography_mapper import RemoveBibliographyMapper
from .remove_comments_mapper import RemoveCommentsMapper
Expand Down Expand Up @@ -78,54 +81,28 @@
from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper
from .whitespace_normalization_mapper import WhitespaceNormalizationMapper

# yapf: enable

__all__ = [
'VideoCaptioningFromAudioMapper',
'VideoTaggingFromAudioMapper',
'ImageCaptioningFromGPT4VMapper',
'PunctuationNormalizationMapper',
'RemoveBibliographyMapper',
'SentenceSplitMapper',
'VideoSplitBySceneMapper',
'CleanIpMapper',
'CleanLinksMapper',
'RemoveHeaderMapper',
'RemoveTableTextMapper',
'VideoRemoveWatermarkMapper',
'RemoveRepeatSentencesMapper',
'ImageDiffusionMapper',
'ImageFaceBlurMapper',
'VideoFFmpegWrappedMapper',
'ChineseConvertMapper',
'NlpcdaZhMapper',
'OptimizeInstructionMapper',
'ImageBlurMapper',
'CleanCopyrightMapper',
'RemoveNonChineseCharacterlMapper',
'VideoSplitByKeyFrameMapper',
'RemoveSpecificCharsMapper',
'VideoResizeAspectRatioMapper',
'CleanHtmlMapper',
'WhitespaceNormalizationMapper',
'VideoTaggingFromFramesMapper',
'RemoveCommentsMapper',
'ExpandMacroMapper',
'ExtractQAMapper',
'ImageCaptioningMapper',
'RemoveWordsWithIncorrectSubstringsMapper',
'VideoCaptioningFromVideoMapper',
'VideoCaptioningFromSummarizerMapper',
'GenerateInstructionMapper',
'FixUnicodeMapper',
'NlpaugEnMapper',
'VideoCaptioningFromFramesMapper',
'RemoveLongWordsMapper',
'VideoResizeResolutionMapper',
'CleanEmailMapper',
'ReplaceContentMapper',
'AudioFFmpegWrappedMapper',
'VideoSplitByDurationMapper',
'VideoFaceBlurMapper',
'ImageTaggingMapper',
'AudioFFmpegWrappedMapper', 'ChineseConvertMapper', 'CleanCopyrightMapper',
'CleanEmailMapper', 'CleanHtmlMapper', 'CleanIpMapper', 'CleanLinksMapper',
'ExpandMacroMapper', 'FixUnicodeMapper', 'GenerateQAFromExamplesMapper',
'GenerateQAFromTextMapper', 'ImageBlurMapper',
'ImageCaptioningFromGPT4VMapper', 'ImageCaptioningMapper',
'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageTaggingMapper',
'NlpaugEnMapper', 'NlpcdaZhMapper', 'OptimizeQAMapper',
'OptimizeQueryMapper', 'OptimizeResponseMapper',
'PunctuationNormalizationMapper', 'RemoveBibliographyMapper',
'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper',
'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper',
'RemoveSpecificCharsMapper', 'RemoveTableTextMapper',
'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper',
'SentenceSplitMapper', 'VideoCaptioningFromAudioMapper',
'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper',
'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper',
'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper',
'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper',
'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper',
'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper',
'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper'
]

# yapf: enable
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@
EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n\n{qa_pairs}'
QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n'

OP_NAME = 'generate_instruction_mapper'
OP_NAME = 'generate_qa_from_examples_mapper'


# TODO: Extend LLM-based OPs into API-based implementation.
@UNFORKABLE.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class GenerateInstructionMapper(Mapper):
"""Mapper to generate new instruction text data.
class GenerateQAFromExamplesMapper(Mapper):
"""
Mapper to generate question and answer pairs from examples.
You should configure an empty dataset in your yaml config file:
```
generated_dataset_config:
Expand Down Expand Up @@ -211,11 +212,12 @@ def parse_chatml_str(self, input_str):
return qa_pairs

def parse_response(self, response_str):
logger.debug(response_str)
pattern = self.qa_extraction_pattern
matches = re.findall(pattern, response_str, re.DOTALL)
response_str = ''
out_qa_pairs = []
for i, match in enumerate(matches):
for match in matches:
question, answer = match
question = question.strip()
answer = answer.strip()
Expand Down Expand Up @@ -257,11 +259,14 @@ def process_single(self, sample=None, rank=None):
output_ids = output_ids[:, inputs.data['input_ids'].shape[1]:]
response_str = processor.decode(output_ids.cpu()[0],
skip_special_tokens=True)
message_list = []
out_qa_pairs, response_str = self.parse_response(response_str)

if not response_str:
return {self.text_key: json.dumps({'messages': message_list})}
return {
self.query_key: '',
self.response_key: '',
self.history_key: []
}

if self.similarity_type == 'rouge_l':
sim_score = self.max_rouge_l_score(response_str,
Expand All @@ -271,13 +276,15 @@ def process_single(self, sample=None, rank=None):
f'Not support similarity type "{self.similarity_type}"!')

if sim_score <= self.similarity_threshold:
for question, answer in out_qa_pairs:
message_list.append({'role': 'user', 'content': question})
message_list.append({'role': 'assistant', 'content': answer})
query, response = out_qa_pairs[-1]
history = out_qa_pairs[:-1]
else:
query = response = ''
history = []
logger.info('Filter this generated sample due to similarity.')

return {
self.text_key:
json.dumps({'messages': message_list}, ensure_ascii=False)
self.query_key: query,
self.response_key: response,
self.history_key: history
}
Loading
Loading