Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,18 @@ hpo_config: null # path to a configur
process:
# Mapper ops. Most of these ops need no arguments.
- audio_ffmpeg_wrapped_mapper: # simple wrapper for FFmpeg audio filters
- calibrate_qa_mapper: # calibrate question-answer pairs based on reference text.
api_url: # API URL. Defaults to DJ_API_URL environment variable.
api_key: # API key. Defaults to DJ_API_KEY environment variable.
response_path: # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt: # System prompt for the calibration task.
input_template: # Template for building the model input.
reference_template: # Template for formatting the reference text.
qa_pair_template: # Template for formatting question-answer pairs.
output_pattern: # Regular expression for parsing model output.
api_params: # Extra parameters passed to the API call.
- calibrate_query_mapper: # calibrate query in question-answer pairs based on reference text.
- calibrate_response_mapper: # calibrate response in question-answer pairs based on reference text.
- chinese_convert_mapper: # convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji.
mode: 's2t' # choose the mode to convert Chinese: ['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t']
- clean_email_mapper: # remove emails from text.
Expand Down
5 changes: 5 additions & 0 deletions data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ def __init__(self, *args, **kwargs):
self.image_key = kwargs.get('image_key', 'images')
self.audio_key = kwargs.get('audio_key', 'audios')
self.video_key = kwargs.get('video_key', 'videos')

self.query_key = kwargs.get('query_key', 'query')
self.response_key = kwargs.get('response_key', 'response')
self.history_key = kwargs.get('history_key', 'history')

self.batch_size = kwargs.get('batch_size', 1000)

# whether the model can be accelerated using cuda
Expand Down
7 changes: 4 additions & 3 deletions data_juicer/ops/deduplicator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from .video_deduplicator import VideoDeduplicator

__all__ = [
'VideoDeduplicator', 'RayBasicDeduplicator', 'DocumentMinhashDeduplicator',
'RayImageDeduplicator', 'RayDocumentDeduplicator', 'DocumentDeduplicator',
'ImageDeduplicator', 'DocumentSimhashDeduplicator', 'RayVideoDeduplicator'
'DocumentDeduplicator', 'DocumentMinhashDeduplicator',
'DocumentSimhashDeduplicator', 'ImageDeduplicator', 'RayBasicDeduplicator',
'RayDocumentDeduplicator', 'RayImageDeduplicator', 'RayVideoDeduplicator',
'VideoDeduplicator'
]
33 changes: 16 additions & 17 deletions data_juicer/ops/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,21 +44,20 @@
from .words_num_filter import WordsNumFilter

__all__ = [
'ImageTextSimilarityFilter', 'VideoAspectRatioFilter',
'ImageTextMatchingFilter', 'ImageNSFWFilter', 'TokenNumFilter',
'TextLengthFilter', 'SpecifiedNumericFieldFilter', 'AudioNMFSNRFilter',
'VideoAestheticsFilter', 'PerplexityFilter', 'PhraseGroundingRecallFilter',
'MaximumLineLengthFilter', 'AverageLineLengthFilter',
'SpecifiedFieldFilter', 'VideoTaggingFromFramesFilter',
'TextEntityDependencyFilter', 'VideoResolutionFilter',
'AlphanumericFilter', 'ImageWatermarkFilter', 'ImageAestheticsFilter',
'AudioSizeFilter', 'StopWordsFilter', 'CharacterRepetitionFilter',
'ImageShapeFilter', 'VideoDurationFilter', 'TextActionFilter',
'VideoOcrAreaRatioFilter', 'VideoNSFWFilter', 'SpecialCharactersFilter',
'VideoFramesTextSimilarityFilter', 'ImageAspectRatioFilter',
'AudioDurationFilter', 'LanguageIDScoreFilter', 'SuffixFilter',
'ImageSizeFilter', 'VideoWatermarkFilter', 'WordsNumFilter',
'ImageFaceCountFilter', 'ImageFaceRatioFilter', 'FlaggedWordFilter',
'WordRepetitionFilter', 'VideoMotionScoreFilter',
'ImagePairSimilarityFilter'
'AlphanumericFilter', 'AudioDurationFilter', 'AudioNMFSNRFilter',
'AudioSizeFilter', 'AverageLineLengthFilter', 'CharacterRepetitionFilter',
'FlaggedWordFilter', 'ImageAestheticsFilter', 'ImageAspectRatioFilter',
'ImageFaceCountFilter', 'ImageFaceRatioFilter', 'ImageNSFWFilter',
'ImagePairSimilarityFilter', 'ImageShapeFilter', 'ImageSizeFilter',
'ImageTextMatchingFilter', 'ImageTextSimilarityFilter',
'ImageWatermarkFilter', 'LanguageIDScoreFilter', 'MaximumLineLengthFilter',
'PerplexityFilter', 'PhraseGroundingRecallFilter',
'SpecialCharactersFilter', 'SpecifiedFieldFilter',
'SpecifiedNumericFieldFilter', 'StopWordsFilter', 'SuffixFilter',
'TextActionFilter', 'TextEntityDependencyFilter', 'TextLengthFilter',
'TokenNumFilter', 'VideoAestheticsFilter', 'VideoAspectRatioFilter',
'VideoDurationFilter', 'VideoFramesTextSimilarityFilter',
'VideoMotionScoreFilter', 'VideoNSFWFilter', 'VideoOcrAreaRatioFilter',
'VideoResolutionFilter', 'VideoTaggingFromFramesFilter',
'VideoWatermarkFilter', 'WordRepetitionFilter', 'WordsNumFilter'
]
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/image_pair_similarity_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def __init__(self,
*args,
**kwargs):
"""
Initialization method.
Initialization method.

:param hf_clip: clip model name on huggingface to compute
the similarity between image and text.
Expand Down
71 changes: 24 additions & 47 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from .audio_ffmpeg_wrapped_mapper import AudioFFmpegWrappedMapper
from .calibrate_qa_mapper import CalibrateQAMapper
from .calibrate_query_mapper import CalibrateQueryMapper
from .calibrate_response_mapper import CalibrateResponseMapper
from .chinese_convert_mapper import ChineseConvertMapper
from .clean_copyright_mapper import CleanCopyrightMapper
from .clean_email_mapper import CleanEmailMapper
Expand Down Expand Up @@ -51,51 +54,25 @@
from .whitespace_normalization_mapper import WhitespaceNormalizationMapper

__all__ = [
'VideoCaptioningFromAudioMapper',
'VideoTaggingFromAudioMapper',
'ImageCaptioningFromGPT4VMapper',
'PunctuationNormalizationMapper',
'RemoveBibliographyMapper',
'SentenceSplitMapper',
'VideoSplitBySceneMapper',
'CleanIpMapper',
'CleanLinksMapper',
'RemoveHeaderMapper',
'RemoveTableTextMapper',
'VideoRemoveWatermarkMapper',
'RemoveRepeatSentencesMapper',
'ImageDiffusionMapper',
'ImageFaceBlurMapper',
'VideoFFmpegWrappedMapper',
'ChineseConvertMapper',
'NlpcdaZhMapper',
'OptimizeInstructionMapper',
'ImageBlurMapper',
'CleanCopyrightMapper',
'RemoveNonChineseCharacterlMapper',
'VideoSplitByKeyFrameMapper',
'RemoveSpecificCharsMapper',
'VideoResizeAspectRatioMapper',
'CleanHtmlMapper',
'WhitespaceNormalizationMapper',
'VideoTaggingFromFramesMapper',
'RemoveCommentsMapper',
'ExpandMacroMapper',
'ExtractQAMapper',
'ImageCaptioningMapper',
'RemoveWordsWithIncorrectSubstringsMapper',
'VideoCaptioningFromVideoMapper',
'VideoCaptioningFromSummarizerMapper',
'GenerateInstructionMapper',
'FixUnicodeMapper',
'NlpaugEnMapper',
'VideoCaptioningFromFramesMapper',
'RemoveLongWordsMapper',
'VideoResizeResolutionMapper',
'CleanEmailMapper',
'ReplaceContentMapper',
'AudioFFmpegWrappedMapper',
'VideoSplitByDurationMapper',
'VideoFaceBlurMapper',
'ImageTaggingMapper',
'AudioFFmpegWrappedMapper', 'CalibrateQAMapper', 'CalibrateQueryMapper',
'CalibrateResponseMapper', 'ChineseConvertMapper', 'CleanCopyrightMapper',
'CleanEmailMapper', 'CleanHtmlMapper', 'CleanIpMapper', 'CleanLinksMapper',
'ExpandMacroMapper', 'ExtractQAMapper', 'FixUnicodeMapper',
'GenerateInstructionMapper', 'ImageBlurMapper',
'ImageCaptioningFromGPT4VMapper', 'ImageCaptioningMapper',
'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageTaggingMapper',
'NlpaugEnMapper', 'NlpcdaZhMapper', 'OptimizeInstructionMapper',
'PunctuationNormalizationMapper', 'RemoveBibliographyMapper',
'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper',
'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper',
'RemoveSpecificCharsMapper', 'RemoveTableTextMapper',
'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper',
'SentenceSplitMapper', 'VideoCaptioningFromAudioMapper',
'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper',
'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper',
'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper',
'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper',
'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper',
'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper',
'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper'
]
109 changes: 109 additions & 0 deletions data_juicer/ops/mapper/calibrate_qa_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import re
from typing import Dict, Optional

from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
from data_juicer.utils.model_utils import get_model, prepare_model

OP_NAME = 'calibrate_qa_mapper'


# TODO: LLM-based inference.
@UNFORKABLE.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class CalibrateQAMapper(Mapper):
    """
    Mapper to calibrate question-answer pairs based on reference text.

    For each sample, the reference text (``text_key``), the query
    (``query_key``) and the response (``response_key``) are formatted into
    a prompt and sent to an API model. The calibrated query and response
    parsed from the model reply are written back into the sample.
    """

    # Built with implicit string concatenation (not backslash
    # continuations) so no source indentation leaks into the prompt.
    DEFAULT_SYSTEM_PROMPT = ('请根据提供的【参考信息】对【问题】和【回答】进行校准,使其更加详细、准确。\n'
                             '按照以下格式输出:\n'
                             '【问题】\n'
                             '校准后的问题\n'
                             '【回答】\n'
                             '校准后的回答')
    DEFAULT_INPUT_TEMPLATE = '{reference}\n{qa_pair}'
    DEFAULT_REFERENCE_TEMPLATE = '【参考信息】\n{}'
    DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
    # Group 1 captures the calibrated question, group 2 the calibrated
    # answer.
    DEFAULT_OUTPUT_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*)'

    def __init__(self,
                 api_model: str = 'gpt-4o',
                 *,
                 api_url: Optional[str] = None,
                 api_key: Optional[str] = None,
                 response_path: Optional[str] = None,
                 system_prompt: Optional[str] = None,
                 input_template: Optional[str] = None,
                 reference_template: Optional[str] = None,
                 qa_pair_template: Optional[str] = None,
                 output_pattern: Optional[str] = None,
                 api_params: Optional[Dict] = None,
                 **kwargs):
        """
        Initialization method.

        :param api_model: API model name.
        :param api_url: API URL. Defaults to DJ_API_URL environment
            variable.
        :param api_key: API key. Defaults to DJ_API_KEY environment
            variable.
        :param response_path: Path to extract content from the API
            response. Defaults to 'choices.0.message.content'.
        :param system_prompt: System prompt for the calibration task.
        :param input_template: Template for building the model input.
            Formatted with the keyword fields ``reference`` and
            ``qa_pair``.
        :param reference_template: Template for formatting the reference
            text. Formatted with one positional field.
        :param qa_pair_template: Template for formatting question-answer
            pairs. Formatted with two positional fields (query, response).
        :param output_pattern: Regular expression for parsing model
            output. Group 1 is taken as the query and group 2 as the
            response; it is applied with ``re.DOTALL``.
        :param api_params: Extra parameters passed to the API call.
        :param kwargs: Extra keyword arguments.
        """
        super().__init__(**kwargs)

        # Any falsy value (None or an empty string) selects the class
        # default, which subclasses may override.
        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
        self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE
        self.reference_template = (reference_template
                                   or self.DEFAULT_REFERENCE_TEMPLATE)
        self.qa_pair_template = (qa_pair_template
                                 or self.DEFAULT_QA_PAIR_TEMPLATE)
        self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN

        self.api_params = api_params or {}
        self.model_key = prepare_model(model_type='api',
                                       api_model=api_model,
                                       api_url=api_url,
                                       api_key=api_key,
                                       response_path=response_path)

    def build_input(self, sample):
        """
        Build the user prompt for one sample.

        :param sample: sample providing the reference text under
            ``text_key`` and the QA pair under ``query_key`` and
            ``response_key``.
        :return: the formatted prompt string.
        """
        reference = self.reference_template.format(sample[self.text_key])
        qa_pair = self.qa_pair_template.format(sample[self.query_key],
                                               sample[self.response_key])
        return self.input_template.format(reference=reference,
                                          qa_pair=qa_pair)

    def parse_output(self, raw_output):
        """
        Extract the calibrated query and response from the model reply.

        :param raw_output: text returned by the model.
        :return: a ``(query, response)`` tuple of stripped strings, or
            ``(None, None)`` if the reply does not match
            ``output_pattern``.
        """
        # Questions and answers routinely span several lines. Without
        # DOTALL, `.` stops at the first newline, so a multi-line question
        # would not match at all and a multi-line answer would be
        # truncated to its first line.
        match = re.match(self.output_pattern, raw_output, re.DOTALL)
        if match is None:
            return None, None
        return match.group(1).strip(), match.group(2).strip()

    def process_single(self, sample=None, rank=None):
        """
        Calibrate the QA pair of one sample through the API model.

        :param sample: sample to process; it is updated in place.
        :param rank: rank forwarded to ``get_model``.
        :return: the sample. A field keeps its original value when
            nothing (or only an empty string) was parsed for it.
        """
        client = get_model(self.model_key, rank=rank)

        messages = [{
            'role': 'system',
            'content': self.system_prompt
        }, {
            'role': 'user',
            'content': self.build_input(sample)
        }]
        output = client(messages, **self.api_params)

        parsed_q, parsed_a = self.parse_output(output)
        # Only overwrite with non-empty results so that an unparsable
        # reply never wipes out the existing query/response.
        if parsed_q:
            sample[self.query_key] = parsed_q
        if parsed_a:
            sample[self.response_key] = parsed_a

        return sample
19 changes: 19 additions & 0 deletions data_juicer/ops/mapper/calibrate_query_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper

OP_NAME = 'calibrate_query_mapper'


# TODO: LLM-based inference.
@UNFORKABLE.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class CalibrateQueryMapper(CalibrateQAMapper):
    """
    Mapper to calibrate query in question-answer pairs based on reference
    text.

    Only the system prompt and the output parsing differ from
    ``CalibrateQAMapper``: the whole model reply is taken as the
    calibrated query.
    """

    # Implicit string concatenation instead of a backslash continuation
    # inside the literal: with a backslash, any indentation of the
    # continuation line would become part of the prompt.
    DEFAULT_SYSTEM_PROMPT = ('请根据提供的【参考信息】对问答对中的【问题】进行校准,'
                             '使其更加详细、准确,且仍可以由原答案回答。只输出校准后的问题,不要输出多余内容。')

    def parse_output(self, raw_output):
        """
        Treat the whole model reply as the calibrated query.

        :param raw_output: text returned by the model.
        :return: ``(query, None)`` where ``query`` is the stripped reply;
            the ``None`` response leaves the original response unchanged.
        """
        return raw_output.strip(), None
19 changes: 19 additions & 0 deletions data_juicer/ops/mapper/calibrate_response_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper

OP_NAME = 'calibrate_response_mapper'


# TODO: LLM-based inference.
@UNFORKABLE.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class CalibrateResponseMapper(CalibrateQAMapper):
    """
    Mapper to calibrate response in question-answer pairs based on
    reference text.

    Only the system prompt and the output parsing differ from
    ``CalibrateQAMapper``: the whole model reply is taken as the
    calibrated response.
    """

    # Implicit string concatenation instead of a backslash continuation
    # inside the literal: with a backslash, any indentation of the
    # continuation line would become part of the prompt.
    DEFAULT_SYSTEM_PROMPT = ('请根据提供的【参考信息】对问答对中的【回答】进行校准,'
                             '使其更加详细、准确,且仍可以回答原问题。只输出校准后的回答,不要输出多余内容。')

    def parse_output(self, raw_output):
        """
        Treat the whole model reply as the calibrated response.

        :param raw_output: text returned by the model.
        :return: ``(None, response)`` where ``response`` is the stripped
            reply; the ``None`` query leaves the original query unchanged.
        """
        return None, raw_output.strip()
Loading
Loading