"""
ParseNode Module
"""
- from typing import Tuple, List, Optional
- from urllib.parse import urljoin
+ from typing import List, Optional
from semchunk import chunk
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document
from .base_node import BaseNode
- from ..helpers import default_filters
-
- import re

class ParseNode(BaseNode):
    """
@@ -44,67 +40,6 @@ def __init__(
        self.parse_html = (
            True if node_config is None else node_config.get("parse_html", True)
        )
-         self.llm_model = node_config['llm_model']
-         self.parse_urls = (
-             False if node_config is None else node_config.get("parse_urls", False)
-         )
-
-     def _clean_urls(self, urls: List[str]) -> List[str]:
-         """
-         Cleans the URLs extracted from the text.
-
-         Args:
-             urls (List[str]): The list of URLs to clean.
-
-         Returns:
-             List[str]: The cleaned URLs.
-         """
-         cleaned_urls = []
-         for url in urls:
-             # Remove any leading 'thumbnail](' or similar patterns
-             url = re.sub(r'.*?\]\(', '', url)
-
-             # Remove any trailing parentheses or brackets
-             url = url.rstrip(').')
-
-             cleaned_urls.append(url)
-
-         return cleaned_urls
-
-     def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
-         """
-         Extracts URLs from the given text.
-
-         Args:
-             text (str): The text to extract URLs from.
-
-         Returns:
-             Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
-         """
-         # Return empty lists if the URLs are not to be parsed
-         if not self.parse_urls:
-             return [], []
-
-         # Regular expression to find URLs (both links and images)
-         image_extensions = default_filters.filter_dict["img_exts"]
-         image_extension_seq = '|'.join(image_extensions).replace('.', '')
-         url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
-
-         # Find all URLs in the string
-         all_urls = url_pattern.findall(text)
-         all_urls = self._clean_urls(all_urls)
-
-         if not source.startswith("http"):
-             # Keep only complete (absolute) URLs
-             all_urls = [url for url in all_urls if url.startswith("http")]
-         else:
-             # Resolve relative URLs against the source URL
-             all_urls = [urljoin(source, url) for url in all_urls]
-
-         images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
-         links = [url for url in all_urls if url not in images]
-
-         return links, images

    def execute(self, state: dict) -> dict:
        """
@@ -127,46 +62,33 @@ def execute(self, state: dict) -> dict:
        input_keys = self.get_input_keys(state)

        input_data = [state[key] for key in input_keys]
-
        docs_transformed = input_data[0]
-         source = input_data[1] if self.parse_urls else None
-
-         def count_tokens(text):
-             from ..utils import token_count
-             return token_count(text, self.llm_model.model_name)

        if self.parse_html:
            docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
            docs_transformed = docs_transformed[0]

-             link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
            chunks = chunk(text=docs_transformed.page_content,
                           chunk_size=self.node_config.get("chunk_size", 4096) - 250,
-                            token_counter=count_tokens,
+                            token_counter=lambda text: len(text.split()),
                           memoize=False)
        else:
            docs_transformed = docs_transformed[0]

-             link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
            chunk_size = self.node_config.get("chunk_size", 4096)
            chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

            if isinstance(docs_transformed, Document):
                chunks = chunk(text=docs_transformed.page_content,
                               chunk_size=chunk_size,
-                                token_counter=count_tokens,
+                                token_counter=lambda text: len(text.split()),
                               memoize=False)
            else:
                chunks = chunk(text=docs_transformed,
                               chunk_size=chunk_size,
-                                token_counter=count_tokens,
+                                token_counter=lambda text: len(text.split()),
                               memoize=False)

        state.update({self.output[0]: chunks})
-         if self.parse_urls:
-             state.update({self.output[1]: link_urls})
-             state.update({self.output[2]: img_urls})

        return state
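For reference, here is a minimal, illustrative sketch (not part of the diff) of the chunking behaviour after this change: semchunk's chunk function receives a whitespace-based token_counter, so token counts are approximated by word counts instead of going through the LLM tokenizer that the removed count_tokens helper used. The sample text and printed output are assumptions for demonstration only.

from semchunk import chunk

# Illustrative sample text; in ParseNode this would be docs_transformed.page_content.
sample_text = "ScrapeGraphAI turns web pages into structured data. " * 200

chunks = chunk(
    text=sample_text,
    chunk_size=4096 - 250,                         # same budget the HTML branch uses by default
    token_counter=lambda text: len(text.split()),  # approximate tokens as whitespace-separated words
    memoize=False,
)

print(f"{len(chunks)} chunks; first chunk starts with: {chunks[0][:60]!r}")

In the non-HTML branch, the effective budget with the default chunk_size of 4096 works out to min(4096 - 500, int(4096 * 0.9)) = min(3596, 3686) = 3596 under this counter.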