 from urllib.parse import urljoin
 from datetime import datetime, timezone
 
+import pywaybackup.SnapshotCollection as sc
+
 
 
 
@@ -67,13 +69,13 @@ def save_page(url: str):
 
 
 
-def print_result(result_list):
+def print_result(snapshots):
     print("")
-    if not result_list:
+    if not snapshots:
         print("No snapshots found")
     else:
-        __import__('pprint').pprint(result_list)
-        print(f"\n-----> {len(result_list)} snapshots listed")
+        __import__('pprint').pprint(snapshots.CDX_RESULT_LIST)
+        print(f"\n-----> {snapshots.count_list()} snapshots listed")
 
 
 
@@ -91,17 +93,10 @@ def query_list(url: str, range: int, mode: str):
         cdxQuery = f"https://web.archive.org/cdx/search/cdx?output=json&url=*.{url}/*{range}&fl=timestamp,original&filter=!statuscode:200"
         cdxResult = requests.get(cdxQuery)
         if cdxResult.status_code != 200: print(f"\n-----> ERROR: could not query snapshots, status code: {cdxResult.status_code}"); exit()
-        cdxResult_json = cdxResult.json()[1:] # first line is fieldlist, so remove it [timestamp, original]
-        cdxResult_list = [{"timestamp": snapshot[0], "url": snapshot[1]} for snapshot in cdxResult_json]
-        if mode == "current":
-            cdxResult_list = sorted(cdxResult_list, key=lambda k: k['timestamp'], reverse=True)
-            cdxResult_list_filtered = []
-            for snapshot in cdxResult_list:
-                if snapshot["url"] not in [snapshot["url"] for snapshot in cdxResult_list_filtered]:
-                    cdxResult_list_filtered.append(snapshot)
-            cdxResult_list = cdxResult_list_filtered
-        print(f"\n-----> {len(cdxResult_list)} snapshots found")
-        return cdxResult_list
+        snapshots = sc.SnapshotCollection(cdxResult)
+        if mode == "current": snapshots.create_current()
+        print(f"\n-----> {snapshots.count_list()} snapshots found")
+        return snapshots
     except requests.exceptions.ConnectionError as e:
         print(f"\n-----> ERROR: could not query snapshots:\n{e}"); exit()
 
@@ -110,38 +105,6 @@ def query_list(url: str, range: int, mode: str):
 
 
 
-def split_url(url):
-    """
-    Split url into domain, subdir and file.
-    If no file is present, the filename will be index.html
-    """
-    domain = url.split("//")[-1].split("/")[0]
-    subdir = "/".join(url.split("//")[-1].split("/")[1:-1])
-    filename = url.split("/")[-1] or "index.html"
-    return domain, subdir, filename
-
-def determine_url_filetype(url):
-    """
-    Determine filetype of the archive-url by looking at the file extension.
-    """
-    image = ["jpg", "jpeg", "png", "gif", "svg", "ico"]
-    css = ["css"]
-    js = ["js"]
-    file_extension = url.split(".")[-1]
-    if file_extension in image:
-        urltype = "im_"
-    elif file_extension in css:
-        urltype = "cs_"
-    elif file_extension in js:
-        urltype = "js_"
-    else:
-        urltype = "id_"
-    return urltype
-
-
-
-
-
 def remove_empty_folders(path, remove_root=True):
     print("")
     print("Removing empty output folders...")
@@ -175,130 +138,108 @@ def remove_empty_folders(path, remove_root=True):
 
 
 # example download: http://web.archive.org/web/20190815104545id_/https://www.google.com/
-# example url: https://www.google.com/
-# example timestamp: 20190815104545
-def download_prepare_list(cdxResult_list, output, retry, worker, mode):
+def download_prepare_list(snapshots, output, retry, worker):
     """
     Download a list of urls in format: [{"timestamp": "20190815104545", "url": "https://www.google.com/"}]
     """
     print("\nDownloading latest snapshots of each file...")
-    download_list = []
-    for snapshot in cdxResult_list:
-        timestamp, url = snapshot["timestamp"], snapshot["url"]
-        type = determine_url_filetype(url)
-        download_url = f"http://web.archive.org/web/{timestamp}{type}/{url}"
-        domain, subdir, filename = split_url(url)
-        if mode == "current": download_dir = os.path.join(output, domain, subdir)
-        if mode == "full": download_dir = os.path.join(output, domain, timestamp, subdir)
-        download_list.append({"url": download_url, "filename": filename, "filepath": download_dir})
+    snapshots.create_collection(output)
+    download_list = snapshots.CDX_RESULT_COLLECTION
     if worker > 1:
         print(f"\n-----> Simultaneous downloads: {worker}")
-        batch_size = len(download_list) // worker + 1
+        batch_size = snapshots.count_collection() // worker + 1
     else:
-        batch_size = len(download_list)
+        batch_size = snapshots.count_collection()
     batch_list = [download_list[i:i + batch_size] for i in range(0, len(download_list), batch_size)]
     threads = []
     worker = 0
     for batch in batch_list:
         worker += 1
-        thread = threading.Thread(target=download_url_list, args=(batch, worker, retry))
+        thread = threading.Thread(target=download_url_list, args=(snapshots, batch, worker, retry))
         threads.append(thread)
         thread.start()
     for thread in threads:
         thread.join()
+    failed_urls = len([url for url in snapshots.CDX_RESULT_COLLECTION if url["success"] == False])
+    if failed_urls: print(f"\n-----> Failed downloads: {failed_urls}")
 
-def download_url_list(url_list, worker, retry):
+def download_url_list(snapshots, url_list, worker, retry, attempt=1, connection=None):
+    max_attempt = retry
     failed_urls = []
-    connection = http.client.HTTPSConnection("web.archive.org")
-    for url_entry in url_list:
-        status = f"\n-----> Snapshot [{url_list.index(url_entry) + 1}/{len(url_list)}] Worker: {worker}"
-        download_url, download_filename, download_filepath = url_entry["url"], url_entry["filename"], url_entry["filepath"]
-        download_status = download_url_entry(download_url, download_filename, download_filepath, connection, status)
-        if download_status != True: failed_urls.append({"url": download_url, "filename": download_filename, "filepath": download_filepath})
-    if retry:
-        download_retry(failed_urls, retry, connection)
-    connection.close()
-
-def download_retry(failed_urls, retry, connection):
-    """
-    Retry failed downloads.
-    failed_urls: [{"url": download_url, "filename": download_filename, "filepath": download_filepath}]
-    retry: int or None
-    """
-    attempt = 1
-    max_attempt = retry if retry is not True else "no-limit"
-    while failed_urls and (attempt <= retry or retry is True):
-        print("\n-----> Retrying...")
-        retry_urls = []
-        for failed_entry in failed_urls:
-            status = f"\n-----> RETRY attempt: [{attempt}/{max_attempt}] Snapshot [{failed_urls.index(failed_entry) + 1}/{len(failed_urls)}]"
-            download_url, download_filename, download_filepath = failed_entry["url"], failed_entry["filename"], failed_entry["filepath"]
-            retry_status = download_url_entry(download_url, download_filename, download_filepath, connection, status)
-            if retry_status != bool(1):
-                retry_urls.append({"url": download_url, "filename": download_filename, "filepath": download_filepath})
-        failed_urls = retry_urls
-        print(f"\n-----> Fail downloads: {len(failed_urls)}")
-        if retry: attempt += 1
-
-def download_url_entry(url, filename, filepath, connection, status_message):
+    if not connection:
+        connection = http.client.HTTPSConnection("web.archive.org")
+    if attempt > max_attempt:
+        connection.close()
+        print(f"\n-----> Worker: {worker} - Failed downloads: {len(url_list)}")
+        return
+    else:
+        for url_entry in url_list:
+            status = f"\n-----> Attempt: [{attempt}/{max_attempt}] Snapshot [{url_list.index(url_entry) + 1}/{len(url_list)}] Worker: {worker}"
+            download_status = download_url_entry(url_entry, connection, status)
+            if download_status != True: failed_urls.append(url_entry); url_entry["retry"] += 1
+            if download_status == True: snapshots.set_value(url_entry["index"], "success", True)
+        attempt += 1
+        if failed_urls: download_url_list(snapshots, failed_urls, worker, retry, attempt, connection)
+
+def download_url_entry(download_entry, connection, status_message):
     """
     Download a single URL and save it to the specified filepath.
 
     Args:
-        url (str): The URL to download.
-        filename (str): The name of the file to save.
-        filepath (str): The path where the file will be saved.
+        download_url (str): The URL to download.
+        download_file (str): The name of the file to save.
         connection (http.client.HTTPConnection): The HTTP connection object.
-        status (str): The current status message.
+        status_message (str): The current status message.
 
     Returns:
         bool: True if the download is successful, False otherwise.
     """
-    output = os.path.join(filepath, filename)
+    download_url = download_entry["url"]
+    download_file = download_entry["file"]
     max_retries = 2
     sleep_time = 45
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
     for i in range(max_retries):
         try:
-            connection.request("GET", url, headers=headers)
+            connection.request("GET", download_url, headers=headers)
             response = connection.getresponse()
             response_data = response.read()
             response_status = response.status
             if response_status == 302:
                 status_message = f"{status_message}\n" + \
                     f"REDIRECT   -> HTTP: {response.status}"
                 while response_status == 302:
-                    connection.request("GET", url, headers=headers)
+                    connection.request("GET", download_url, headers=headers)
                     response = connection.getresponse()
                     response_data = response.read()
                     response_status = response.status
                     location = response.getheader("Location")
                     if location:
                         status_message = f"{status_message}\n" + \
                             f"           -> URL: {location}"
-                        location = urljoin(url, location)
-                        url = location
+                        location = urljoin(download_url, location)
+                        download_url = location
                     else:
                         break
             if response_status != 404:
-                os.makedirs(filepath, exist_ok=True)
-                with open(output, 'wb') as file:
+                os.makedirs(os.path.dirname(download_file), exist_ok=True)
+                with open(download_file, 'wb') as file:
                     file.write(response_data)
             if response_status == 200:
                 status_message = f"{status_message}\n" + \
                     f"SUCCESS    -> HTTP: {response.status}\n" + \
-                    f"           -> URL: {url}\n" + \
-                    f"           -> FILE: {output}"
-                print(status_message)
+                    f"           -> URL: {download_url}\n" + \
+                    f"           -> FILE: {download_file}"
             elif response_status == 404:
                 status_message = f"{status_message}\n" + \
                     f"NOT FOUND  -> HTTP: {response.status}\n" + \
-                    f"           -> URL: {url}"
+                    f"           -> URL: {download_url}"
             else:
                 status_message = f"{status_message}\n" + \
                     f"UNEXPECTED -> HTTP: {response.status}\n" + \
-                    f"           -> URL: {url}\n" + \
-                    f"           -> FILE: {output}"
+                    f"           -> URL: {download_url}\n" + \
+                    f"           -> FILE: {download_file}"
+            print(status_message)
             return True
         except ConnectionRefusedError as e:
             status_message = f"{status_message}\n" + \
@@ -308,11 +249,11 @@ def download_url_entry(url, filename, filepath, connection, status_message):
             time.sleep(sleep_time)
         except http.client.HTTPException as e:
             status_message = f"{status_message}\n" + \
-                f"EXCEPTION -> ({i + 1}/{max_retries}), append to failed_urls: {url}\n" + \
+                f"EXCEPTION -> ({i + 1}/{max_retries}), append to failed_urls: {download_url}\n" + \
                 f"          -> {e}"
             print(status_message)
             return False
-    print(f"FAILED -> download, append to failed_urls: {url}")
+    print(f"FAILED -> download, append to failed_urls: {download_url}")
     return False
 
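Note: the new pywaybackup/SnapshotCollection.py module is not part of the hunks shown here. The sketch below is only a rough illustration of the interface this diff appears to rely on, inferred from the call sites above (SnapshotCollection(cdxResult), create_current, create_collection, count_list, count_collection, CDX_RESULT_LIST, CDX_RESULT_COLLECTION, set_value); the actual implementation in the repository may differ.

# Hypothetical sketch of the SnapshotCollection interface assumed by this diff.
# Inferred from the call sites in archive.py; NOT the actual pywaybackup implementation.
class SnapshotCollection:

    def __init__(self, cdxResult):
        # first row of the CDX JSON response is the field list, the rest are [timestamp, original] rows
        self.CDX_RESULT_LIST = [{"timestamp": row[0], "url": row[1]} for row in cdxResult.json()[1:]]
        self.CDX_RESULT_COLLECTION = []

    def create_current(self):
        # keep only the newest snapshot per URL (mirrors the filtering removed from query_list)
        self.CDX_RESULT_LIST = sorted(self.CDX_RESULT_LIST, key=lambda k: k["timestamp"], reverse=True)
        seen, current = set(), []
        for snapshot in self.CDX_RESULT_LIST:
            if snapshot["url"] not in seen:
                seen.add(snapshot["url"])
                current.append(snapshot)
        self.CDX_RESULT_LIST = current

    def create_collection(self, output):
        # build the download entries consumed by download_url_list / download_url_entry;
        # "url" and "file" are placeholders here - the real code presumably builds the
        # archive download URL and a local file path under `output`
        for index, snapshot in enumerate(self.CDX_RESULT_LIST):
            self.CDX_RESULT_COLLECTION.append({
                "index": index,
                "url": snapshot["url"],
                "file": output,
                "success": False,
                "retry": 0,
            })

    def count_list(self):
        return len(self.CDX_RESULT_LIST)

    def count_collection(self):
        return len(self.CDX_RESULT_COLLECTION)

    def set_value(self, index, key, value):
        self.CDX_RESULT_COLLECTION[index][key] = value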