Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
import copy
import os
import traceback
from functools import wraps

import pyarrow as pa
from loguru import logger

from data_juicer import is_cuda_available
from data_juicer.utils.auto_install_utils import AutoInstaller
from data_juicer.utils.constant import Fields
from data_juicer.utils.mm_utils import size_to_bytes
from data_juicer.utils.process_utils import calculate_np
from data_juicer.utils.registry import Registry

OPERATORS = Registry('Operators')
UNFORKABLE = Registry('Unforkable')
current_path = os.path.dirname(os.path.realpath(__file__))
version_file_path = os.path.join(current_path,
'../../environments/science_requires.txt')
AUTOINSTALL = AutoInstaller([version_file_path])


def convert_list_dict_to_dict_list(samples):
Expand Down
12 changes: 6 additions & 6 deletions data_juicer/ops/deduplicator/document_minhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,16 @@
from loguru import logger
from tqdm import tqdm

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import prepare_sentencepiece_model

from ..base_op import OPERATORS, Deduplicator
from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..common.helper_func import UnionFind, split_on_whitespace

OP_NAME = 'document_minhash_deduplicator'

with AvailabilityChecking(['scipy'], OP_NAME):
from scipy.integrate import quad as integrate
integrate = LazyLoader('integrate', globals(), 'scipy.integrate')

MERSENNE_PRIME = np.uint64((1 << 61) - 1)
MAX_HASH = np.uint64((1 << 32) - 1)
Expand Down Expand Up @@ -68,7 +67,7 @@ def false_positive_probability(th: float, band: int, rows: int):
def proba(s):
return 1 - (1 - s**float(rows))**float(band)

a, _ = integrate(proba, 0.0, th)
a, _ = integrate.quad(proba, 0.0, th)
return a

def false_negative_probability(th: float, band: int, rows: int):
Expand All @@ -77,7 +76,7 @@ def false_negative_probability(th: float, band: int, rows: int):
def proba(s):
return 1 - (1 - (1 - s**float(rows))**float(band))

a, _ = integrate(proba, th, 1.0)
a, _ = integrate.quad(proba, th, 1.0)
return a

# object: minimize the weighted FP and FN ratio
Expand Down Expand Up @@ -150,6 +149,7 @@ def __init__(
sentencepiece tokenization.
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['scipy'])
# about minhash computation
self.tokenization = tokenization
self.window_size = window_size
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,15 @@
from jsonargparse.typing import PositiveInt
from loguru import logger

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import OPERATORS, Deduplicator
from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..common.helper_func import split_on_whitespace

OP_NAME = 'document_simhash_deduplicator'

with AvailabilityChecking(['simhash-pybind'], OP_NAME):
import simhash
simhash = LazyLoader('simhash', globals(), 'simhash')


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -57,6 +56,7 @@ def __init__(self,
"""
# about simhash computation
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['simhash-pybind'])
self.tokenization = tokenization
self.window_size = window_size
self.lowercase = lowercase
Expand Down
24 changes: 10 additions & 14 deletions data_juicer/ops/deduplicator/image_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,27 @@

import numpy as np

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ..base_op import OPERATORS, Deduplicator
from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..op_fusion import LOADED_IMAGES
from .document_deduplicator import DocumentDeduplicator

OP_NAME = 'image_deduplicator'

with AvailabilityChecking(['imagededup'], OP_NAME):
import imagededup # noqa: F401
imagededup = LazyLoader('imagededup', globals(), 'imagededup')

HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}
HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}

def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

mapping = {
'phash': PHash,
'dhash': DHash,
'whash': WHash,
'ahash': AHash
}
def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

return mapping[method_name]
mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash}

return mapping[method_name]


@OPERATORS.register_module(OP_NAME)
Expand All @@ -54,6 +49,7 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['imagededup'])
if method not in HASH_METHOD:
raise ValueError(f'Keep strategy [{method}] is not supported. '
f'Can only be one of {HASH_METHOD}.')
Expand Down
24 changes: 10 additions & 14 deletions data_juicer/ops/deduplicator/ray_image_deduplicator.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,26 @@
import numpy as np
from jsonargparse.typing import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ..base_op import OPERATORS
from ..base_op import AUTOINSTALL, OPERATORS
from ..op_fusion import LOADED_IMAGES
from .ray_basic_deduplicator import RayBasicDeduplicator

OP_NAME = 'ray_image_deduplicator'

with AvailabilityChecking(['imagededup'], OP_NAME):
import imagededup # noqa: F401
imagededup = LazyLoader('imagededup', globals(), 'imagededup')

HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}
HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}

def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

mapping = {
'phash': PHash,
'dhash': DHash,
'whash': WHash,
'ahash': AHash
}
def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

return mapping[method_name]
mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash}

return mapping[method_name]


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -53,6 +48,7 @@ def __init__(self,
redis_port=redis_port,
*args,
**kwargs)
AUTOINSTALL.check(['imagededup'])
if method not in HASH_METHOD:
raise ValueError(f'Keep strategy [{method}] is not supported. '
f'Can only be one of {HASH_METHOD}.')
Expand Down
7 changes: 2 additions & 5 deletions data_juicer/ops/filter/alphanumeric_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,14 @@

from jsonargparse.typing import PositiveFloat

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..common import get_words_from_document

OP_NAME = 'alphanumeric_filter'

with AvailabilityChecking(['transformers'], OP_NAME):
import transformers # noqa: F401


@OPERATORS.register_module('alphanumeric_filter')
class AlphanumericFilter(Filter):
Expand Down Expand Up @@ -43,6 +39,7 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['transformers'])
self.tokenization = tokenization
self.min_ratio = min_ratio
self.max_ratio = max_ratio
Expand Down
7 changes: 2 additions & 5 deletions data_juicer/ops/filter/flagged_words_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,17 @@

from jsonargparse.typing import ClosedUnitInterval, List

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, InterVars, StatsKeys
from data_juicer.utils.model_utils import get_model, prepare_model

from ...utils.asset_utils import ASSET_DIR, load_words_asset
from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
words_refinement)
from ..op_fusion import INTER_WORDS

OP_NAME = 'flagged_words_filter'

with AvailabilityChecking(['sentencepiece'], OP_NAME):
import sentencepiece # noqa: F401


@OPERATORS.register_module(OP_NAME)
@INTER_WORDS.register_module(OP_NAME)
Expand Down Expand Up @@ -56,6 +52,7 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['sentencepiece'])
self.lang = lang
self.max_ratio = max_ratio
self.use_words_aug = use_words_aug
Expand Down
15 changes: 5 additions & 10 deletions data_juicer/ops/filter/image_aesthetics_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,18 @@
from jsonargparse.typing import ClosedUnitInterval
from loguru import logger

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ...utils.model_utils import get_model, prepare_model
from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_aesthetics_filter'
CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor']

with AvailabilityChecking(CHECK_PKGs, OP_NAME):

import aesthetics_predictor # noqa: F401
import torch
import transformers # noqa: F401

# avoid hanging when calling clip in multiprocessing
torch.set_num_threads(1)
torch = LazyLoader('torch', globals(), 'torch')


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -57,6 +50,8 @@ def __init__(self,
"""

super().__init__(*args, **kwargs)
AUTOINSTALL.check(
['torch', 'transformers', 'simple-aesthetics-predictor'])
if hf_scorer_model == '':
hf_scorer_model = \
'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE'
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/ops/filter/image_face_ratio_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,18 @@
from jsonargparse.typing import ClosedUnitInterval
from loguru import logger

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context,
load_image)
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, UNFORKABLE, Filter
from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_face_ratio_filter'

with AvailabilityChecking(['opencv-python'], OP_NAME):
import cv2
cv2 = LazyLoader('cv2', globals(), 'cv2')


@UNFORKABLE.register_module(OP_NAME)
Expand Down Expand Up @@ -55,6 +54,7 @@ def __init__(self,
:param kwargs: Extra keyword arguments.
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['opencv-python'])

if cv_classifier == '':
cv_classifier = os.path.join(cv2.data.haarcascades,
Expand Down
13 changes: 5 additions & 8 deletions data_juicer/ops/filter/image_nsfw_filter.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
import numpy as np
from jsonargparse.typing import ClosedUnitInterval

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_nsfw_filter'

with AvailabilityChecking(['torch', 'transformers'], OP_NAME):
import torch
import transformers # noqa: F401

# avoid hanging when calling nsfw detection in multiprocessing
torch.set_num_threads(1)
torch = LazyLoader('torch', globals(), 'torch')
transformers = LazyLoader('transformers', globals(), 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -48,6 +44,7 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['torch', 'transformers'])
self.score_threshold = score_threshold
if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
Expand Down
13 changes: 5 additions & 8 deletions data_juicer/ops/filter/image_text_matching_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,19 @@
from jsonargparse.typing import ClosedUnitInterval
from PIL import ImageOps

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context,
load_image, remove_special_tokens)
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_text_matching_filter'

with AvailabilityChecking(['torch', 'transformers'], OP_NAME):
import torch
import transformers # noqa: F401

# avoid hanging when calling blip in multiprocessing
torch.set_num_threads(1)
torch = LazyLoader('torch', globals(), 'torch')
transformers = LazyLoader('transformers', globals(), 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -62,6 +58,7 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['torch', 'transformers'])
self.min_score = min_score
self.max_score = max_score
if reduce_mode not in ['avg', 'max', 'min']:
Expand Down
Loading
Loading