1+ import  os 
12from  abc  import  ABC , abstractmethod 
23from  typing  import  Any , Dict , List 
34
5+ import  requests 
6+ 
47
58class  BaseReader (ABC ):
69    """ 
@@ -18,3 +21,45 @@ def read(self, file_path: str) -> List[Dict[str, Any]]:
1821        :param file_path: Path to the input file. 
1922        :return: List of dictionaries containing the data. 
2023        """ 
24+ 
25+     @staticmethod  
26+     def  filter (data : List [dict ]) ->  List [dict ]:
27+         """ 
28+         Filter out entries with empty or missing text in the specified column. 
29+ 
30+         :param data: List of dictionaries containing the data. 
31+         :return: Filtered list of dictionaries. 
32+         """ 
33+ 
34+         def  _image_exists (path_or_url : str , timeout : int  =  3 ) ->  bool :
35+             """ 
36+             Check if an image exists at the given local path or URL. 
37+             :param path_or_url: Local file path or remote URL of the image. 
38+             :param timeout: Timeout for remote URL requests in seconds. 
39+             :return: True if the image exists, False otherwise. 
40+             """ 
41+             if  not  path_or_url :
42+                 return  False 
43+ 
44+             if  not  path_or_url .startswith (("http://" , "https://" , "ftp://" )):
45+                 path  =  path_or_url .replace ("file://" , "" , 1 )
46+                 return  os .path .isfile (path )
47+             try :
48+                 resp  =  requests .head (path_or_url , allow_redirects = True , timeout = timeout )
49+                 return  resp .status_code  ==  200 
50+             except  requests .RequestException :
51+                 return  False 
52+ 
53+         filtered_data  =  []
54+         for  item  in  data :
55+             if  item .get ("type" ) ==  "text" :
56+                 content  =  item .get ("content" , "" ).strip ()
57+                 if  content :
58+                     filtered_data .append (item )
59+             elif  item .get ("type" ) in  ("image" , "table" , "equation" ):
60+                 img_path  =  item .get ("img_path" )
61+                 if  _image_exists (img_path ):
62+                     filtered_data .append (item )
63+             else :
64+                 filtered_data .append (item )
65+         return  filtered_data 
0 commit comments