modelscope · drcege · Nov 7, 2024 · Oct 23, 2024 · Oct 24, 2024 · Oct 29, 2024
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -53,6 +53,18 @@ hpo_config: null                                            # path to a configur
 process:
   # Mapper ops. Most of these ops need no arguments.
   - audio_ffmpeg_wrapped_mapper:                            # simple wrapper for FFmpeg audio filters
+  - calibrate_qa_mapper:                                    # calibrate question-answer pairs based on reference text.
+      api_url:                                                # API URL. Defaults to DJ_API_URL environment variable.
+      api_key:                                                # API key. Defaults to DJ_API_KEY environment variable.
+      response_path:                                          # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      system_prompt:                                          # System prompt for the calibration task.
+      input_template:                                         # Template for building the model input.
+      reference_template:                                     # Template for formatting the reference text.
+      qa_pair_template:                                       # Template for formatting question-answer pairs.
+      output_pattern:                                         # Regular expression for parsing model output.
+      api_params:                                             # Extra parameters passed to the API call.
+  - calibrate_query_mapper:                                 # calibrate query in question-answer pairs based on reference text.
+  - calibrate_response_mapper:                              # calibrate response in question-answer pairs based on reference text.
   - chinese_convert_mapper:                                 # convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji.
       mode: 's2t'                                             # choose the mode to convert Chinese: ['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t']
   - clean_email_mapper:                                     # remove emails from text.

diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py
@@ -133,6 +133,11 @@ def __init__(self, *args, **kwargs):
         self.image_key = kwargs.get('image_key', 'images')
         self.audio_key = kwargs.get('audio_key', 'audios')
         self.video_key = kwargs.get('video_key', 'videos')
+
+        self.query_key = kwargs.get('query_key', 'query')
+        self.response_key = kwargs.get('response_key', 'response')
+        self.history_key = kwargs.get('history_key', 'history')
+
         self.batch_size = kwargs.get('batch_size', 1000)
 
         # whether the model can be accelerated using cuda

diff --git a/data_juicer/ops/deduplicator/__init__.py b/data_juicer/ops/deduplicator/__init__.py
@@ -9,7 +9,8 @@
 from .video_deduplicator import VideoDeduplicator
 
 __all__ = [
-    'VideoDeduplicator', 'RayBasicDeduplicator', 'DocumentMinhashDeduplicator',
-    'RayImageDeduplicator', 'RayDocumentDeduplicator', 'DocumentDeduplicator',
-    'ImageDeduplicator', 'DocumentSimhashDeduplicator', 'RayVideoDeduplicator'
+    'DocumentDeduplicator', 'DocumentMinhashDeduplicator',
+    'DocumentSimhashDeduplicator', 'ImageDeduplicator', 'RayBasicDeduplicator',
+    'RayDocumentDeduplicator', 'RayImageDeduplicator', 'RayVideoDeduplicator',
+    'VideoDeduplicator'
 ]
diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py
@@ -44,21 +44,20 @@
 from .words_num_filter import WordsNumFilter
 
 __all__ = [
-    'ImageTextSimilarityFilter', 'VideoAspectRatioFilter',
-    'ImageTextMatchingFilter', 'ImageNSFWFilter', 'TokenNumFilter',
-    'TextLengthFilter', 'SpecifiedNumericFieldFilter', 'AudioNMFSNRFilter',
-    'VideoAestheticsFilter', 'PerplexityFilter', 'PhraseGroundingRecallFilter',
-    'MaximumLineLengthFilter', 'AverageLineLengthFilter',
-    'SpecifiedFieldFilter', 'VideoTaggingFromFramesFilter',
-    'TextEntityDependencyFilter', 'VideoResolutionFilter',
-    'AlphanumericFilter', 'ImageWatermarkFilter', 'ImageAestheticsFilter',
-    'AudioSizeFilter', 'StopWordsFilter', 'CharacterRepetitionFilter',
-    'ImageShapeFilter', 'VideoDurationFilter', 'TextActionFilter',
-    'VideoOcrAreaRatioFilter', 'VideoNSFWFilter', 'SpecialCharactersFilter',
-    'VideoFramesTextSimilarityFilter', 'ImageAspectRatioFilter',
-    'AudioDurationFilter', 'LanguageIDScoreFilter', 'SuffixFilter',
-    'ImageSizeFilter', 'VideoWatermarkFilter', 'WordsNumFilter',
-    'ImageFaceCountFilter', 'ImageFaceRatioFilter', 'FlaggedWordFilter',
-    'WordRepetitionFilter', 'VideoMotionScoreFilter',
-    'ImagePairSimilarityFilter'
+    'AlphanumericFilter', 'AudioDurationFilter', 'AudioNMFSNRFilter',
+    'AudioSizeFilter', 'AverageLineLengthFilter', 'CharacterRepetitionFilter',
+    'FlaggedWordFilter', 'ImageAestheticsFilter', 'ImageAspectRatioFilter',
+    'ImageFaceCountFilter', 'ImageFaceRatioFilter', 'ImageNSFWFilter',
+    'ImagePairSimilarityFilter', 'ImageShapeFilter', 'ImageSizeFilter',
+    'ImageTextMatchingFilter', 'ImageTextSimilarityFilter',
+    'ImageWatermarkFilter', 'LanguageIDScoreFilter', 'MaximumLineLengthFilter',
+    'PerplexityFilter', 'PhraseGroundingRecallFilter',
+    'SpecialCharactersFilter', 'SpecifiedFieldFilter',
+    'SpecifiedNumericFieldFilter', 'StopWordsFilter', 'SuffixFilter',
+    'TextActionFilter', 'TextEntityDependencyFilter', 'TextLengthFilter',
+    'TokenNumFilter', 'VideoAestheticsFilter', 'VideoAspectRatioFilter',
+    'VideoDurationFilter', 'VideoFramesTextSimilarityFilter',
+    'VideoMotionScoreFilter', 'VideoNSFWFilter', 'VideoOcrAreaRatioFilter',
+    'VideoResolutionFilter', 'VideoTaggingFromFramesFilter',
+    'VideoWatermarkFilter', 'WordRepetitionFilter', 'WordsNumFilter'
 ]
diff --git a/data_juicer/ops/filter/image_pair_similarity_filter.py b/data_juicer/ops/filter/image_pair_similarity_filter.py
@@ -30,7 +30,7 @@ def __init__(self,
                  *args,
                  **kwargs):
         """
-    Initialization method.
+        Initialization method.
 
         :param hf_clip: clip model name on huggingface to compute
             the similarity between image and text.

diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
@@ -1,4 +1,7 @@
 from .audio_ffmpeg_wrapped_mapper import AudioFFmpegWrappedMapper
+from .calibrate_qa_mapper import CalibrateQAMapper
+from .calibrate_query_mapper import CalibrateQueryMapper
+from .calibrate_response_mapper import CalibrateResponseMapper
 from .chinese_convert_mapper import ChineseConvertMapper
 from .clean_copyright_mapper import CleanCopyrightMapper
 from .clean_email_mapper import CleanEmailMapper
@@ -51,51 +54,25 @@
 from .whitespace_normalization_mapper import WhitespaceNormalizationMapper
 
 __all__ = [
-    'VideoCaptioningFromAudioMapper',
-    'VideoTaggingFromAudioMapper',
-    'ImageCaptioningFromGPT4VMapper',
-    'PunctuationNormalizationMapper',
-    'RemoveBibliographyMapper',
-    'SentenceSplitMapper',
-    'VideoSplitBySceneMapper',
-    'CleanIpMapper',
-    'CleanLinksMapper',
-    'RemoveHeaderMapper',
-    'RemoveTableTextMapper',
-    'VideoRemoveWatermarkMapper',
-    'RemoveRepeatSentencesMapper',
-    'ImageDiffusionMapper',
-    'ImageFaceBlurMapper',
-    'VideoFFmpegWrappedMapper',
-    'ChineseConvertMapper',
-    'NlpcdaZhMapper',
-    'OptimizeInstructionMapper',
-    'ImageBlurMapper',
-    'CleanCopyrightMapper',
-    'RemoveNonChineseCharacterlMapper',
-    'VideoSplitByKeyFrameMapper',
-    'RemoveSpecificCharsMapper',
-    'VideoResizeAspectRatioMapper',
-    'CleanHtmlMapper',
-    'WhitespaceNormalizationMapper',
-    'VideoTaggingFromFramesMapper',
-    'RemoveCommentsMapper',
-    'ExpandMacroMapper',
-    'ExtractQAMapper',
-    'ImageCaptioningMapper',
-    'RemoveWordsWithIncorrectSubstringsMapper',
-    'VideoCaptioningFromVideoMapper',
-    'VideoCaptioningFromSummarizerMapper',
-    'GenerateInstructionMapper',
-    'FixUnicodeMapper',
-    'NlpaugEnMapper',
-    'VideoCaptioningFromFramesMapper',
-    'RemoveLongWordsMapper',
-    'VideoResizeResolutionMapper',
-    'CleanEmailMapper',
-    'ReplaceContentMapper',
-    'AudioFFmpegWrappedMapper',
-    'VideoSplitByDurationMapper',
-    'VideoFaceBlurMapper',
-    'ImageTaggingMapper',
+    'AudioFFmpegWrappedMapper', 'CalibrateQAMapper', 'CalibrateQueryMapper',
+    'CalibrateResponseMapper', 'ChineseConvertMapper', 'CleanCopyrightMapper',
+    'CleanEmailMapper', 'CleanHtmlMapper', 'CleanIpMapper', 'CleanLinksMapper',
+    'ExpandMacroMapper', 'ExtractQAMapper', 'FixUnicodeMapper',
+    'GenerateInstructionMapper', 'ImageBlurMapper',
+    'ImageCaptioningFromGPT4VMapper', 'ImageCaptioningMapper',
+    'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageTaggingMapper',
+    'NlpaugEnMapper', 'NlpcdaZhMapper', 'OptimizeInstructionMapper',
+    'PunctuationNormalizationMapper', 'RemoveBibliographyMapper',
+    'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper',
+    'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper',
+    'RemoveSpecificCharsMapper', 'RemoveTableTextMapper',
+    'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper',
+    'SentenceSplitMapper', 'VideoCaptioningFromAudioMapper',
+    'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper',
+    'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper',
+    'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper',
+    'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper',
+    'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper',
+    'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper',
+    'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper'
 ]
diff --git a/data_juicer/ops/mapper/calibrate_qa_mapper.py b/data_juicer/ops/mapper/calibrate_qa_mapper.py
@@ -0,0 +1,109 @@
+import re
+from typing import Dict, Optional
+
+from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.utils.model_utils import get_model, prepare_model
+
+OP_NAME = 'calibrate_qa_mapper'
+
+
+# TODO: LLM-based inference.
+@UNFORKABLE.register_module(OP_NAME)
+@OPERATORS.register_module(OP_NAME)
+class CalibrateQAMapper(Mapper):
+    """
+    Mapper to calibrate question-answer pairs based on reference text.
+    """
+
+    # avoid leading whitespace
+    DEFAULT_SYSTEM_PROMPT = ('请根据提供的【参考信息】对【问题】和【回答】进行校准，使其更加详细、准确。\n'
+                             '按照以下格式输出：\n'
+                             '【问题】\n'
+                             '校准后的问题\n'
+                             '【回答】\n'
+                             '校准后的回答')
+    DEFAULT_INPUT_TEMPLATE = '{reference}\n{qa_pair}'
+    DEFAULT_REFERENCE_TEMPLATE = '【参考信息】\n{}'
+    DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
+    DEFAULT_OUTPUT_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*)'
+
+    def __init__(self,
+                 api_model: str = 'gpt-4o',
+                 *,
+                 api_url: Optional[str] = None,
+                 api_key: Optional[str] = None,
+                 response_path: Optional[str] = None,
+                 system_prompt: Optional[str] = None,
+                 input_template: Optional[str] = None,
+                 reference_template: Optional[str] = None,
+                 qa_pair_template: Optional[str] = None,
+                 output_pattern: Optional[str] = None,
+                 api_params: Optional[Dict] = None,
+                 **kwargs):
+        """
+        Initialization method.
+
+        :param api_model: API model name.
+        :param api_url: API URL. Defaults to DJ_API_URL environment variable.
+        :param api_key: API key. Defaults to DJ_API_KEY environment variable.
+        :param response_path: Path to extract content from the API response.
+            Defaults to 'choices.0.message.content'.
+        :param system_prompt: System prompt for the calibration task.
+        :param input_template: Template for building the model input.
+        :param reference_template: Template for formatting the reference text.
+        :param qa_pair_template: Template for formatting question-answer pairs.
+        :param output_pattern: Regular expression for parsing model output.
+        :param api_params: Extra parameters passed to the API call.
+        :param kwargs: Extra keyword arguments.
+        """
+        super().__init__(**kwargs)
+
+        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
+        self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE
+        self.reference_template = reference_template or \
+            self.DEFAULT_REFERENCE_TEMPLATE
+        self.qa_pair_template = qa_pair_template or \
+            self.DEFAULT_QA_PAIR_TEMPLATE
+        self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN
+
+        self.api_params = api_params or {}
+        self.model_key = prepare_model(model_type='api',
+                                       api_model=api_model,
+                                       api_url=api_url,
+                                       api_key=api_key,
+                                       response_path=response_path)
+
+    def build_input(self, sample):
+        """Render the reference text and the QA pair into the user prompt."""
+        reference = self.reference_template.format(sample[self.text_key])
+        qa_pair = self.qa_pair_template.format(sample[self.query_key],
+                                               sample[self.response_key])
+        return self.input_template.format(reference=reference,
+                                          qa_pair=qa_pair)
+
+    def parse_output(self, raw_output):
+        """
+        Return the stripped (query, response) groups matched by
+        ``output_pattern``, or ``(None, None)`` when there is no match.
+        """
+        # DOTALL: the question and the answer may each span several lines.
+        match = re.match(self.output_pattern, raw_output, re.DOTALL)
+        if match:
+            return match.group(1).strip(), match.group(2).strip()
+        return None, None
+
+    def process_single(self, sample=None, rank=None):
+        """Query the API model and overwrite the fields it calibrated."""
+        client = get_model(self.model_key, rank=rank)
+        messages = [
+            {'role': 'system', 'content': self.system_prompt},
+            {'role': 'user', 'content': self.build_input(sample)},
+        ]
+        output = client(messages, **self.api_params)
+        parsed_q, parsed_a = self.parse_output(output)
+        # Keep the original value when a part is missing or empty.
+        if parsed_q:
+            sample[self.query_key] = parsed_q
+        if parsed_a:
+            sample[self.response_key] = parsed_a
+        return sample
diff --git a/data_juicer/ops/mapper/calibrate_query_mapper.py b/data_juicer/ops/mapper/calibrate_query_mapper.py
@@ -0,0 +1,19 @@
+from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
+from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper
+
+OP_NAME = 'calibrate_query_mapper'
+
+
+# TODO: LLM-based inference.
+@UNFORKABLE.register_module(OP_NAME)
+@OPERATORS.register_module(OP_NAME)
+class CalibrateQueryMapper(CalibrateQAMapper):
+    """Mapper to calibrate the query of a QA pair based on reference text."""
+
+    # Implicit concatenation keeps source indentation out of the prompt.
+    DEFAULT_SYSTEM_PROMPT = ('请根据提供的【参考信息】对问答对中的【问题】进行校准，'
+                             '使其更加详细、准确，且仍可以由原答案回答。只输出校准后的问题，不要输出多余内容。')
+
+    def parse_output(self, raw_output):
+        """Use the stripped output as the query; return no response."""
+        return raw_output.strip(), None
diff --git a/data_juicer/ops/mapper/calibrate_response_mapper.py b/data_juicer/ops/mapper/calibrate_response_mapper.py
@@ -0,0 +1,19 @@
+from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
+from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper
+
+OP_NAME = 'calibrate_response_mapper'
+
+
+# TODO: LLM-based inference.
+@UNFORKABLE.register_module(OP_NAME)
+@OPERATORS.register_module(OP_NAME)
+class CalibrateResponseMapper(CalibrateQAMapper):
+    """Mapper to calibrate the response of a QA pair based on reference text."""
+
+    # Implicit concatenation keeps source indentation out of the prompt.
+    DEFAULT_SYSTEM_PROMPT = ('请根据提供的【参考信息】对问答对中的【回答】进行校准，'
+                             '使其更加详细、准确，且仍可以回答原问题。只输出校准后的回答，不要输出多余内容。')
+
+    def parse_output(self, raw_output):
+        """Use the stripped output as the response; return no query."""
+        return None, raw_output.strip()