
Commit e90a759

HYLcool and yxdyc authored
Add general fused op (#626)
* + add general_fused_op
* + add parameter descriptions for two ffmpeg wrapper ops in config_all.yaml
* - remove lazy_loading for bs4
* Update op_fusion.py
* * run pre-commit

Co-authored-by: Daoyuan Chen <67475544+yxdyc@users.noreply.github.com>
1 parent: f9d5f93

5 files changed: +242 −24 lines

configs/config_all.yaml

Lines changed: 14 additions & 4 deletions
@@ -77,11 +77,16 @@ hpo_config: null # path to a configur
 # process schedule: a list of several process operators with their arguments
 process:
   # Mapper ops. Most of these ops need no arguments.
-  - audio_add_gaussian_noise_mapper:      # Mapper to add Gaussian noise to audio.
-      min_amplitude: 0.001                # Default: 0.001. Minimum noise amplification factor.
-      max_amplitude: 0.015                # Default: 0.015. Maximum noise amplification factor.
-      p: 0.5                              # Default: 0.5 (range: [0.0, 1.0]). The probability of applying this transform.
+  - audio_add_gaussian_noise_mapper:      # Mapper to add Gaussian noise to audio.
+      min_amplitude: 0.001                # Default: 0.001. Minimum noise amplification factor.
+      max_amplitude: 0.015                # Default: 0.015. Maximum noise amplification factor.
+      p: 0.5                              # Default: 0.5 (range: [0.0, 1.0]). The probability of applying this transform.
   - audio_ffmpeg_wrapped_mapper:          # simple wrapper for FFmpeg audio filters
+      filter_name: null                   # ffmpeg audio filter name, e.g. 'atrim'.
+      filter_kwargs: null                 # keyword arguments passed to the ffmpeg filter, e.g. {'end': 6}.
+      global_args: null                   # list arguments passed to the ffmpeg command line, e.g. ['-progress'].
+      capture_stderr: true                # whether to capture stderr.
+      overwrite_output: true              # whether to overwrite the output file.
   - calibrate_qa_mapper:                  # calibrate question-answer pairs based on reference text.
       api_model: 'gpt-4o'                 # API model name.
       api_endpoint: null                  # URL endpoint for the API.
@@ -543,6 +548,11 @@ process:
       blur_type: 'gaussian'               # type of blur kernel, one of ['mean', 'box', 'gaussian']
       radius: 2                           # radius of blur kernel
   - video_ffmpeg_wrapped_mapper:          # simple wrapper for FFmpeg video filters
+      filter_name: null                   # ffmpeg video filter name, e.g. 'scale'.
+      filter_kwargs: null                 # keyword arguments passed to the ffmpeg filter, e.g. {'width': 224, 'height': 224}.
+      global_args: null                   # list arguments passed to the ffmpeg command line, e.g. ['-progress'].
+      capture_stderr: true                # whether to capture stderr.
+      overwrite_output: true              # whether to overwrite the output file.
   - video_remove_watermark_mapper:        # Remove the watermarks in videos given regions
       roi_strings: ['0,0,0.1,0.1']        # a given list of regions where the watermarks are located. Each can be formatted as "x1, y1, x2, y2", "(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".
       roi_type: ratio                     # the roi string type. When the type is 'pixel', (x1, y1) and (x2, y2) are the pixel locations of the top-left and bottom-right corners. If roi_type is 'ratio', the coordinates are normalized by widths and heights.
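
As a quick sanity check on the newly documented parameters, here is a minimal sketch that builds both FFmpeg wrapper ops from process-style dicts (the same shape load_ops consumes in the tests below; the filter choices mirror the example values in the comments above, and the snippet itself is hypothetical, not part of this commit):

    from data_juicer.ops.load import load_ops

    # Both wrapper ops, configured with the example values documented above:
    # trim audio to its first 6 seconds, rescale video to 224x224.
    ops = load_ops([
        {'audio_ffmpeg_wrapped_mapper': {
            'filter_name': 'atrim',
            'filter_kwargs': {'end': 6},
        }},
        {'video_ffmpeg_wrapped_mapper': {
            'filter_name': 'scale',
            'filter_kwargs': {'width': 224, 'height': 224},
        }},
    ])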

data_juicer/ops/base_op.py

Lines changed: 15 additions & 14 deletions
@@ -241,9 +241,9 @@ def run(self, dataset):
         if not isinstance(dataset, NestedDataset):
             dataset = NestedDataset(dataset)
         # add meta field for OPs that produce tags
+        from data_juicer.core.data import add_same_content_to_new_column
         if self._name in TAGGING_OPS.modules \
                 and Fields.meta not in dataset.features:
-            from data_juicer.core.data import add_same_content_to_new_column
             dataset = dataset.map(add_same_content_to_new_column,
                                   fn_kwargs={
                                       'new_column_name': Fields.meta,
@@ -252,7 +252,20 @@ def run(self, dataset):
                                   num_proc=self.runtime_np(),
                                   batch_size=self.batch_size,
                                   desc='Adding new column for meta')
-        if self.index_key is not None:
+        # add stats field for Filters that produce stats
+        if isinstance(self, Filter) \
+                and self._name not in NON_STATS_FILTERS.modules \
+                and Fields.stats not in dataset.features:
+            dataset = dataset.map(add_same_content_to_new_column,
+                                  fn_kwargs={
+                                      'new_column_name': Fields.stats,
+                                      'initial_value': {}
+                                  },
+                                  num_proc=self.runtime_np(),
+                                  batch_size=self.batch_size,
+                                  desc='Adding new column for stats')
+        if self.index_key is not None \
+                and self.index_key not in dataset.features:

             def add_index(sample, idx):
                 sample[self.index_key] = idx
@@ -455,18 +468,6 @@ def process_single(self, sample):

     def run(self, dataset, *, exporter=None, tracer=None, reduce=True):
         dataset = super(Filter, self).run(dataset)
-        # add stats field for Filters that produce stats
-        if self._name not in NON_STATS_FILTERS.modules \
-                and Fields.stats not in dataset.features:
-            from data_juicer.core.data import add_same_content_to_new_column
-            dataset = dataset.map(add_same_content_to_new_column,
-                                  fn_kwargs={
-                                      'new_column_name': Fields.stats,
-                                      'initial_value': {}
-                                  },
-                                  num_proc=self.runtime_np(),
-                                  batch_size=self.batch_size,
-                                  desc='Adding new column for stats')
         dataset = dataset.map(self.compute_stats,
                               num_proc=self.runtime_np(),
                               with_rank=self.use_cuda(),
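
Why the move matters: the stats-column bootstrap used to live in Filter.run, so only a full Filter.run call would create it. Hoisting it into OP.run means any caller that only invokes the base-class run, as GeneralFusedOP does below once per fused op, still gets the Fields.stats column. A minimal sketch of that calling pattern, using ops from this diff and a hypothetical two-sample dataset:

    from data_juicer.core import NestedDataset
    from data_juicer.ops.base_op import OP
    from data_juicer.ops.load import load_ops

    # Hypothetical dataset, just for illustration.
    ds = NestedDataset.from_list([{'text': 'aaa'}, {'text': 'abc'}])

    # The base-class run only prepares columns (meta/stats/index); it does
    # not compute stats or drop samples. After this change, it is enough to
    # give a Filter its Fields.stats column before batched processing.
    flt = load_ops([{'character_repetition_filter': {'max_ratio': 0.2}}])[0]
    ds = OP.run(flt, ds)  # ds now carries an empty stats entry per sample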

data_juicer/ops/mapper/extract_tables_from_html_mapper.py

Lines changed: 2 additions & 3 deletions
@@ -1,10 +1,9 @@
+import bs4
+
 from data_juicer.utils.constant import Fields, MetaKeys
-from data_juicer.utils.lazy_loader import LazyLoader

 from ..base_op import OPERATORS, TAGGING_OPS, Mapper

-bs4 = LazyLoader('bs4', 'bs4')
-
 OP_NAME = 'extract_tables_from_html_mapper'

data_juicer/ops/op_fusion.py

Lines changed: 74 additions & 2 deletions
@@ -3,11 +3,11 @@
 import numpy as np
 from loguru import logger

+from data_juicer.ops.base_op import OP, OPERATORS, Filter, Mapper
+from data_juicer.ops.load import load_ops
 from data_juicer.utils.constant import Fields, InterVars
 from data_juicer.utils.registry import Registry

-from .base_op import Filter
-
 # Type of intermediate vars
 # text
 INTER_LINES = Registry(InterVars.lines)
@@ -196,3 +196,75 @@ def process_batched(self, samples):
         else:
             res = this_res
         return res
+
+
+@OPERATORS.register_module('general_fused_op')
+class GeneralFusedOP(OP):
+    """An explicitly fused operator designed to execute multiple sequential
+    operations (OPs) on the same batch, enabling fine-grained control over
+    data processing."""
+
+    _batched_op = True
+
+    def __init__(self,
+                 batch_size: int = 1,
+                 fused_op_list: List = None,
+                 *args,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.batch_size = batch_size
+        if fused_op_list is None:
+            fused_op_list = []
+        self.fused_ops = load_ops(fused_op_list)
+        self._name = 'GeneralFusedOP:(%s)' % ','.join(
+            [op._name for op in self.fused_ops])
+        # set accelerator to 'cuda' if any fused op's accelerator is 'cuda'
+        accelerator_methods = set([op.accelerator for op in self.fused_ops])
+        if 'cuda' in accelerator_methods:
+            self.accelerator = 'cuda'
+
+        # update num_proc with the min num_proc of all fusible filters
+        self.num_proc = min([op.runtime_np() for op in self.fused_ops]) \
+            if self.fused_ops else 1
+
+    def process_batched(self, samples, rank=None):
+        for op in self.fused_ops:
+            process_args = {'rank': rank} if op.accelerator == 'cuda' else {}
+            if isinstance(op, Mapper):
+                samples = op.process_batched(samples, **process_args)
+            elif isinstance(op, Filter):
+                samples = op.compute_stats_batched(samples, **process_args)
+                indicators = list(op.process_batched(samples))
+                new_samples = {}
+                for key in samples:
+                    new_samples[key] = [
+                        val for val, indicator in zip(samples[key], indicators)
+                        if indicator
+                    ]
+                samples = new_samples
+            else:
+                raise NotImplementedError(
+                    f'FusedOP does not support OP {op._name} of type '
+                    f'{type(op)} and only supports Mapper and Filter now.')
+        return samples
+
+    def run(self, dataset, *, exporter=None, tracer=None):
+        # prepare the dataset
+        from data_juicer.core.data import NestedDataset
+        if not isinstance(dataset, NestedDataset):
+            dataset = NestedDataset(dataset)
+        if not self.fused_ops:
+            return dataset
+        # initialize for different kinds of datasets
+        for op in self.fused_ops:
+            dataset = OP.run(op, dataset)
+
+        new_dataset = dataset.map(
+            self.process_batched,
+            num_proc=self.num_proc,
+            with_rank=self.use_cuda(),
+            batch_size=self.batch_size,
+            desc=self._name + '_process',
+        )
+        return new_dataset
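
For orientation, here is a minimal sketch of driving the new op directly. The op names and arguments are taken from the tests below; the two-sample dataset is hypothetical:

    from data_juicer.core import NestedDataset
    from data_juicer.ops.load import load_ops

    # One GeneralFusedOP wrapping a filter and two mappers; each batch of
    # 2 samples flows through all three ops before the next batch loads.
    fused = load_ops([{
        'general_fused_op': {
            'batch_size': 2,
            'fused_op_list': [
                {'language_id_score_filter': {'lang': 'en', 'min_score': 0.8}},
                {'whitespace_normalization_mapper': {}},
                {'punctuation_normalization_mapper': {}},
            ],
        }
    }])[0]

    ds = NestedDataset.from_list([{'text': 'This is a test.'},
                                  {'text': 'punc test。'}])
    res = fused.run(ds)  # filtered and normalized in a single pass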

tests/ops/test_op_fusion.py

Lines changed: 137 additions & 1 deletion
@@ -1,12 +1,27 @@
 import unittest

+from data_juicer.core import NestedDataset
+from data_juicer.ops.base_op import OP
 from data_juicer.ops.load import load_ops
-from data_juicer.ops.op_fusion import fuse_operators
+from data_juicer.ops.op_fusion import fuse_operators, GeneralFusedOP
 from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase


 class OpFusionTest(DataJuicerTestCaseBase):

+    def _run_equal_config(self, original_process_list):
+        dataset = NestedDataset.from_list([
+            {'text': 'This is a test.'},
+            {'text': 'This is a test. This is a test. This is a test.'},
+            {'text': 'aaaaaaaaaaaaaaabbbbbbbbbbbbcccccccccccccc'},
+            {'text': 'punc test。'}
+        ])
+        unfused_op = load_ops(original_process_list)
+        fused_ops = fuse_operators(unfused_op)
+        res1 = dataset.process(fused_ops)
+        res2 = dataset.process(unfused_op)
+        self.assertDatasetEqual(res1, res2)
+
     def _run_op_fusion(self, original_process_list, target_process_list, probe_res=None):
         ops = load_ops(original_process_list)
         ops = fuse_operators(ops, probe_res)
@@ -232,6 +247,7 @@ def test_regular_config(self):
             }
         ]
         self._run_op_fusion(original_process, target_process)
+        self._run_equal_config(original_process)

     def test_only_mapper(self):
         original_process = [{
@@ -1961,5 +1977,125 @@ def test_different_intermediate_vars_with_probe_res(self):
         self._run_op_fusion(original_process, target_process, probe_res_list)


+class GeneralFusedOPTest(DataJuicerTestCaseBase):
+
+    def setUp(self) -> None:
+        self.dataset = NestedDataset.from_list([
+            {'text': 'This is a test.'},
+            {'text': 'This is a test. This is a test. This is a test.'},
+            {'text': 'aaaaaaaaaaaaaaabbbbbbbbbbbbcccccccccccccc'},
+            {'text': 'punc test。'}
+        ])
+
+    def _run_equal_config(self, fused_process, unfused_process):
+        fused_op = load_ops(fused_process)
+        self.assertEqual(len(fused_op), 1)
+        fused_op = fused_op[0]
+        unfused_op = load_ops(unfused_process)
+        self.assertIsInstance(fused_op, GeneralFusedOP)
+        self.assertEqual(len(fused_op.fused_ops), len(unfused_process))
+        res1 = self.dataset.process(fused_op)
+        res2 = self.dataset.process(unfused_op)
+        # invoke process_batched directly
+        for op in fused_op.fused_ops:
+            self.dataset = OP.run(op, self.dataset)
+        res3 = fused_op.process_batched(self.dataset.to_dict())
+        self.assertDatasetEqual(res1, res2)
+        self.assertEqual(res1.to_dict(), res3)
+
+    def test_regular_config(self):
+
+        original_process = [{
+            'language_id_score_filter': {
+                'lang': 'en',
+                'min_score': 0.8,
+                'text_key': 'text'
+            }
+        }, {
+            'whitespace_normalization_mapper': {
+                'text_key': 'text'
+            }
+        }, {
+            'punctuation_normalization_mapper': {
+                'text_key': 'text'
+            }
+        }, {
+            'fix_unicode_mapper': {
+                'text_key': 'text'
+            }
+        }, {
+            'character_repetition_filter': {
+                'max_ratio': 0.106,
+                'min_ratio': 0.0,
+                'rep_len': 10,
+                'text_key': 'text'
+            }
+        }]
+        fused_process = [{
+            'general_fused_op': {
+                'batch_size': 2,
+                'fused_op_list': original_process,
+            }
+        }]
+        self._run_equal_config(fused_process, original_process)
+
+    def test_border_cases(self):
+
+        original_process = [{
+            'language_id_score_filter': {
+                'lang': 'en',
+                'min_score': 0.8,
+                'text_key': 'text'
+            }
+        }, {
+            'whitespace_normalization_mapper': {
+                'text_key': 'text'
+            }
+        }, {
+            'punctuation_normalization_mapper': {
+                'text_key': 'text'
+            }
+        }, {
+            'fix_unicode_mapper': {
+                'text_key': 'text'
+            }
+        }, {
+            'character_repetition_filter': {
+                'max_ratio': 0.106,
+                'min_ratio': 0.0,
+                'rep_len': 10,
+                'text_key': 'text'
+            }
+        }]
+        empty_fused_process = [{
+            'general_fused_op': {
+                'batch_size': 2,
+                'fused_op_list': None,
+            }
+        }]
+        fused_process = [{
+            'general_fused_op': {
+                'batch_size': 2,
+                'fused_op_list': original_process,
+            }
+        }]
+        # empty fused process
+        fused_op = load_ops(empty_fused_process)[0]
+        self.assertEqual(len(fused_op.fused_ops), 0)
+        res = fused_op.run(self.dataset)
+        self.assertDatasetEqual(res, self.dataset)
+        # unsupported fused op
+        with self.assertRaises(NotImplementedError):
+            fused_op = load_ops([{
+                'general_fused_op': {
+                    'batch_size': 2,
+                    'fused_op_list': [{
+                        'document_deduplicator': {}
+                    }],
+                }
+            }])[0]
+            fused_op.process_batched(self.dataset.to_dict())
+
+
 if __name__ == '__main__':
     unittest.main()
