1
+ import os
2
+ import random
3
+ import string
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import Optional , Union
7
+ from urllib .parse import urlparse
8
+
9
+ import requests
10
+
1
11
from mindee .error .mindee_error import MindeeSourceError
12
+ from mindee .input .sources .bytes_input import BytesInput
2
13
from mindee .input .sources .local_input_source import InputType
3
14
from mindee .logger import logger
4
15
@@ -13,7 +24,7 @@ def __init__(self, url: str) -> None:
13
24
"""
14
25
Input document from a URL.
15
26
16
- :param url: URL to send, must be HTTPS
27
+ :param url: URL to send, must be HTTPS.
17
28
"""
18
29
if not url .lower ().startswith ("https" ):
19
30
raise MindeeSourceError ("URL must be HTTPS" )
@@ -23,3 +34,175 @@ def __init__(self, url: str) -> None:
23
34
logger .debug ("URL input: %s" , url )
24
35
25
36
self .url = url
37
+
38
def __fetch_file_content(
    self,
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    headers: Optional[dict] = None,
    max_redirects: int = 3,
) -> bytes:
    """
    Fetch the content of the file from the URL.

    :param username: Optional username; only used when ``password`` is also given,
        in which case both are passed to the request as an auth tuple.
    :param password: Optional password; only used when ``username`` is also given.
    :param token: Optional token, sent as an ``Authorization: Bearer`` header.
    :param headers: Optional additional headers for the request. The mapping is
        copied, so the caller's object is never modified.
    :param max_redirects: Maximum number of redirects to follow.
    :return: The content of the file as bytes.
    """
    # Work on a copy: adding the Authorization entry below must not leak into
    # a dict the caller may reuse for other requests.
    request_headers = dict(headers) if headers else {}
    if token:
        request_headers["Authorization"] = f"Bearer {token}"

    # Credentials are only meaningful as a pair; a lone username or password
    # is ignored.
    auth = (username, password) if username and password else None

    return UrlInputSource.__make_request(
        self.url, auth, request_headers, 0, max_redirects=max_redirects
    )
67
+
68
def save_to_file(
    self,
    filepath: Union[Path, str],
    filename: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    headers: Optional[dict] = None,
    max_redirects: int = 3,
) -> Path:
    """
    Download the URL content and write it to a file.

    :param filepath: Directory in which the file is written.
    :param filename: Optional name of the file inside ``filepath``; when missing
        or without extension, a name is filled in automatically.
    :param username: Optional username for authentication.
    :param password: Optional password for authentication.
    :param token: Optional token for authentication.
    :param headers: Optional additional headers for the request.
    :param max_redirects: Maximum number of redirects to follow.
    :return: The path to the saved file.
    """
    # Download first, so nothing is created on disk when the request fails.
    content = self.__fetch_file_content(
        username=username,
        password=password,
        token=token,
        headers=headers,
        max_redirects=max_redirects,
    )
    destination = Path(filepath) / self.__fill_filename(filename)
    destination.write_bytes(content)
    return destination
98
+
99
def as_local_input_source(
    self,
    filename: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    headers: Optional[dict] = None,
    max_redirects: int = 3,
) -> BytesInput:
    """
    Download the URL content and wrap it in a BytesInput object.

    :param filename: Optional filename for the BytesInput; when missing or
        without extension, a name is filled in automatically.
    :param username: Optional username for authentication.
    :param password: Optional password for authentication.
    :param token: Optional token for authentication.
    :param headers: Optional additional headers for the request.
    :param max_redirects: Maximum number of redirects to follow.
    :return: A BytesInput object containing the file content.
    """
    # The download happens before the name is resolved, as a failed request
    # makes the name irrelevant.
    content = self.__fetch_file_content(
        username=username,
        password=password,
        token=token,
        headers=headers,
        max_redirects=max_redirects,
    )
    resolved_name = self.__fill_filename(filename)
    return BytesInput(content, resolved_name)
125
+
126
@staticmethod
def __extract_filename_from_url(uri) -> str:
    """
    Return the last path segment of a URL.

    Only the path component is considered, so the query string and fragment
    never end up in the result.

    :param uri: The URL to extract the filename from.
    :return: The extracted filename or an empty string if not found.
    """
    url_path = urlparse(uri).path
    return os.path.basename(url_path) or ""
136
+
137
@staticmethod
def __generate_file_name(extension=".tmp") -> str:
    """
    Generate a filename from the current timestamp and a random string.

    The random part makes collisions unlikely but does not guarantee uniqueness.

    :param extension: The file extension to append, leading dot included
        (default is '.tmp'). ``None`` falls back to '.tmp'.
    :return: A generated filename, e.g.
        ``mindee_temp_2024-01-31_12-00-00_a1b2c3d4.tmp``.
    """
    # ``None`` would otherwise be rendered literally as "None" in the name.
    suffix = ".tmp" if extension is None else extension
    random_string = "".join(
        random.choices(string.ascii_lowercase + string.digits, k=8)
    )
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # No whitespace between the parts: the result is used as a file name.
    return f"mindee_temp_{timestamp}_{random_string}{suffix}"
150
+
151
@staticmethod
def __get_file_extension(filename) -> Optional[str]:
    """
    Return the lowercased extension of a filename.

    :param filename: The filename to extract the extension from.
    :return: The lowercase file extension, leading dot included, or None if
        the filename has no extension.
    """
    _, suffix = os.path.splitext(filename)
    if not suffix:
        return None
    return suffix.lower()
161
+
162
def __fill_filename(self, filename=None) -> str:
    """
    Fill in a filename if not provided or incomplete.

    When no filename is given, the last path segment of the URL is used. If the
    resulting name is empty or has no extension, it is replaced by a generated
    name.

    :param filename: Optional filename to use.
    :return: A complete filename.
    """
    if filename is None:
        filename = UrlInputSource.__extract_filename_from_url(self.url)

    if not filename or not os.path.splitext(filename)[1]:
        # This branch is only reached when there is no extension to reuse, so
        # rely on the generator's default extension instead of forwarding
        # ``None``, which would end up literally as "None" in the name.
        filename = self.__generate_file_name()

    return filename
178
+
179
@staticmethod
def __make_request(url, auth, headers, redirects, max_redirects) -> bytes:
    """
    Make an HTTP GET request to the given URL, following redirections manually.

    :param url: The URL to request.
    :param auth: Authentication tuple (username, password), or None.
    :param headers: Headers for the request.
    :param redirects: Number of redirects already followed.
    :param max_redirects: Maximum number of redirects to follow.
    :return: The content of the response.
    :raises MindeeSourceError: If max redirects are exceeded, if a redirect
        response carries no target, or if the final status is not 2xx.
    """
    # Local import: only this method needs it.
    from urllib.parse import urljoin

    while True:
        # Automatic redirect handling is switched off so that the
        # ``max_redirects`` limit below is the one actually enforced.
        result = requests.get(
            url, headers=headers, timeout=120, auth=auth, allow_redirects=False
        )
        if not 299 < result.status_code < 400:
            break

        if redirects >= max_redirects:
            raise MindeeSourceError(
                f"Can't reach URL after {redirects} out of {max_redirects} redirects, "
                f"aborting operation."
            )

        location = result.headers.get("Location")
        if not location:
            # A 3xx answer without a target (e.g. 304) cannot be followed.
            raise MindeeSourceError(
                f"Couldn't retrieve file from server, error code {result.status_code}."
            )

        # The Location header may be relative to the URL just requested.
        target = urljoin(url, location)
        if urlparse(target).netloc.lower() != urlparse(url).netloc.lower():
            # Never forward credentials to a different host than the one
            # they were supplied for.
            auth = None
            headers = {
                key: value
                for key, value in (headers or {}).items()
                if key.lower() != "authorization"
            }
        url = target
        redirects += 1

    if result.status_code >= 400 or result.status_code < 200:
        raise MindeeSourceError(
            f"Couldn't retrieve file from server, error code {result.status_code}."
        )

    return result.content
0 commit comments