Skip to content

Commit 12ee557

Browse files
feat: filter non-exist content
1 parent 90f3a72 commit 12ee557

File tree

3 files changed

+47
-1
lines changed

3 files changed

+47
-1
lines changed

graphgen/bases/base_reader.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
import os
12
from abc import ABC, abstractmethod
23
from typing import Any, Dict, List
34

5+
import requests
6+
47

58
class BaseReader(ABC):
69
"""
@@ -18,3 +21,45 @@ def read(self, file_path: str) -> List[Dict[str, Any]]:
1821
:param file_path: Path to the input file.
1922
:return: List of dictionaries containing the data.
2023
"""
24+
25+
@staticmethod
def filter(data: List[dict]) -> List[dict]:
    """
    Drop entries that carry no usable content.

    The rule applied to each item depends on its ``"type"``:

    - ``"text"``: kept only when ``"content"`` is a string that is
      non-empty after stripping whitespace.
    - ``"image"``, ``"table"``, ``"equation"``: kept only when
      ``"img_path"`` names an existing local file, or a URL that answers
      a HEAD request with status 200.
    - anything else (including a missing ``"type"``): kept unchanged.

    :param data: List of dictionaries containing the data.
    :return: New list with the surviving items, in their original order.
    """

    def _image_exists(path_or_url: Any, timeout: int = 3) -> bool:
        """
        Check if an image exists at the given local path or URL.

        :param path_or_url: Local file path (optionally prefixed with
            ``file://``) or remote URL of the image.
        :param timeout: Timeout for remote URL requests in seconds.
        :return: True if the image exists, False otherwise.
        """
        # A missing, empty or non-string path can never point at an image;
        # checking the type here avoids AttributeError on .startswith().
        if not isinstance(path_or_url, str) or not path_or_url:
            return False

        if path_or_url.startswith(("http://", "https://", "ftp://")):
            # NOTE(review): requests presumably has no adapter for ftp://,
            # so such URLs would raise a RequestException subclass and be
            # reported as missing -- confirm whether that is intended.
            try:
                resp = requests.head(
                    path_or_url, allow_redirects=True, timeout=timeout
                )
            except requests.RequestException:
                return False
            return resp.status_code == 200

        # Strip the scheme only when it is a real prefix, so that a
        # "file://" substring in the middle of a path is left untouched.
        scheme = "file://"
        if path_or_url.startswith(scheme):
            path_or_url = path_or_url[len(scheme):]
        return os.path.isfile(path_or_url)

    def _keep(item: dict) -> bool:
        """Return True when *item* should survive the filter."""
        kind = item.get("type")
        if kind == "text":
            content = item.get("content")
            # None or non-string content counts as missing text.
            return isinstance(content, str) and bool(content.strip())
        if kind in ("image", "table", "equation"):
            return _image_exists(item.get("img_path"))
        # Unknown types are passed through untouched.
        return True

    return [item for item in data if _keep(item)]

graphgen/configs/vqa_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
read:
2-
input_file: resources/input_examples/pdf_demo.pdf # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
2+
input_file: resources/input_examples/vqa_demo.json # input file path; supports json, jsonl, txt, pdf. See resources/input_examples for examples
33
split:
44
chunk_size: 1024 # chunk size for text splitting
55
chunk_overlap: 100 # chunk overlap for text splitting

graphgen/graphgen.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ async def insert(self, read_config: Dict, split_config: Dict):
9292
"""
9393
# Step 1: Read files
9494
data = read_files(read_config["input_file"], self.working_dir)
95+
print(data)
9596
if len(data) == 0:
9697
logger.warning("No data to process")
9798
return

0 commit comments

Comments
 (0)