|
| 1 | +project_name: 'llava-1.5-pretrain-dataset-refine-recipe' |
| 2 | +dataset_path: 'blip_laion_cc_sbu_558k_dj_fmt_only_caption.jsonl' # converted LLaVA pretrain dataset in Data-Juicer format, produced with only_keep_caption set to True. See tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py |
| 3 | +export_path: 'blip_laion_cc_sbu_558k_dj_fmt_only_caption_refined.jsonl' |
| 4 | + |
| 5 | +np: 42 # number of subprocesses used to process your dataset |
| 6 | +text_keys: 'text' # the key name of the field that holds the sample texts to be processed, e.g., `text`, `instruction`, `output`, ... |
| 7 | + |
| 8 | +# for multimodal data processing |
| 9 | +image_key: 'images' # Key name of field to store the list of sample image paths. |
| 10 | +image_special_token: '<image>' # The special token that represents an image in the text. For LLaVA, it's "<image>". Should be aligned with the args when running conversion tools. |
| 11 | +eoc_special_token: '<|__dj__eoc|>' # The special token that represents the end of a chunk in the text. By default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset. Should be aligned with the args when running conversion tools. |
| 12 | + |
| 13 | +open_tracer: true |
| 14 | + |
| 15 | +# process schedule: a list of several process operators with their arguments |
| 16 | +process: |
| 17 | + - fix_unicode_mapper: # fix unicode errors in text. |
| 18 | + - punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations. |
| 19 | + |
| 20 | + # 558128 |
| 21 | + # Filter ops |
| 22 | + - alphanumeric_filter: #558087 # filter text with alphabet/numeric ratio out of specific range. |
| 23 | + tokenization: false # Whether to count the ratio of alphanumeric to the total number of tokens. |
| 24 | + min_ratio: 0.60 # the min ratio of filter range |
| 25 | + - character_repetition_filter: #546105 # filter text with the character repetition ratio out of specific range |
| 26 | + rep_len: 10 # repetition length for char-level n-gram |
| 27 | + max_ratio: 0.09373663 # the max ratio of filter range |
| 28 | + - flagged_words_filter: #543960 # filter text with the flagged-word ratio larger than a specific max value |
| 29 | + lang: en # language of the flagged words to consider |
| 30 | + tokenization: false # whether to use model to tokenize documents |
| 31 | + max_ratio: 0.0 # the max ratio to filter text |
| 32 | + - perplexity_filter: #532029 # filter text with perplexity score out of specific range |
| 33 | + lang: en # language in which to compute perplexity |
| 34 | + max_ppl: 14435.5806 # the max perplexity score to filter text |
| 35 | + - special_characters_filter: #531968 # filter text with special-char ratio out of specific range |
| 36 | + min_ratio: 0.16534802 # the min ratio of filter range |
| 37 | + max_ratio: 0.42023757 # the max ratio of filter range |
| 38 | + - word_repetition_filter: # 530773 # filter text with the word repetition ratio out of specific range |
| 39 | + lang: en # language of the samples |
| 40 | + tokenization: false # whether to use model to tokenize documents |
| 41 | + rep_len: 10 # repetition length for word-level n-gram |
| 42 | + max_ratio: 0.03085751 # the max ratio of filter range |
| 43 | + |
| 44 | + - image_aspect_ratio_filter: #542389 # filter samples according to the aspect ratios of images (a fraction of width by height, r=w/h) in them |
| 45 | + min_ratio: 0.333 # the min aspect ratio of filter range |
| 46 | + max_ratio: 3.0 # the max aspect ratio of filter range |
| 47 | + any_or_all: any # keep this sample when any/all images meet the filter condition |
| 48 | + - image_shape_filter: #533966 # filter samples according to the widths and heights of images in them |
| 49 | + max_width: 727.8798422276 # the max width of width filter range |
| 50 | + max_height: 606.2421072264 # the max height of height filter range |
| 51 | + any_or_all: any # keep this sample when any/all images meet the filter condition |
| 52 | + - image_size_filter: # 533966 # filter samples according to the size of images (in bytes) within them |
| 53 | + max_size: "124KB" # the max size of filter range |
| 54 | + any_or_all: any # keep this sample when any/all images meet the filter condition |
| 55 | + - image_text_similarity_filter: #544202 # filter samples according to the similarity between text and images. |
| 56 | + hf_clip: openai/clip-vit-base-patch32 # name of the Hugging Face CLIP model to use |
| 57 | + min_score: 0.20315419 # the min similarity of filter range |
| 58 | + - image_text_matching_filter: # filter samples according to the matching score between image and text. |
| 59 | + hf_blip: Salesforce/blip-itm-base-coco # name of the Hugging Face BLIP model to use |
| 60 | + min_score: 0.44930778 # the min matching score of filter range |
0 commit comments