14
14
15
15
16
16
17
+
17
18
# GET: store the page to the Wayback Machine and respond with a redirect to the snapshot
18
19
# POST: store the page to the Wayback Machine and respond with the Wayback Machine status page
19
20
# tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
@@ -69,7 +70,6 @@ def save_page(url: str):
69
70
70
71
71
72
72
-
73
73
def print_list ():
74
74
v .write ("" )
75
75
count = sc .count_list ()
@@ -146,6 +146,10 @@ def download_list(output, retry, no_redirect, workers):
146
146
for thread in threads :
147
147
thread .join ()
148
148
149
+
150
+
151
+
152
+
149
153
def download_loop (snapshot_batch , output , worker , retry , no_redirect , attempt = 1 , connection = None ):
150
154
"""
151
155
Download a list of URLs in a recursive loop. If a download fails, the function will retry the download.
@@ -174,6 +178,10 @@ def download_loop(snapshot_batch, output, worker, retry, no_redirect, attempt=1,
174
178
time .sleep (15 )
175
179
download_loop (failed_urls , output , worker , retry , no_redirect , attempt , connection )
176
180
181
+
182
+
183
+
184
+
177
185
def download (output , snapshot_entry , connection , status_message , no_redirect = False ):
178
186
"""
179
187
Download a single URL and save it to the specified filepath.
@@ -202,20 +210,21 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
202
210
response = connection .getresponse ()
203
211
response_data = response .read ()
204
212
response_status = response .status
213
+ response_status_message = parse_response_code (response_status )
205
214
location = response .getheader ("Location" )
206
215
if location :
207
- status_message = f"{ status_message } \n " + \
208
- f" -> URL: { location } "
209
216
location = urljoin (download_url , location )
210
217
download_url = location
218
+ status_message = f"{ status_message } \n " + \
219
+ f" -> URL: { download_url } "
211
220
sc .snapshot_entry_modify (snapshot_entry , "redirect_timestamp" , sc .url_get_timestamp (location ))
212
221
sc .snapshot_entry_modify (snapshot_entry , "redirect_url" , location )
213
222
else :
214
223
break
215
224
if response_status == 200 :
216
- sc . snapshot_entry_modify ( snapshot_entry , "file" , sc .snapshot_entry_create_output (snapshot_entry , output ) )
217
- download_file = snapshot_entry [ "file" ]
218
- os .makedirs (os . path . dirname ( download_file ) , exist_ok = True )
225
+ download_file = sc .snapshot_entry_create_output (snapshot_entry , output )
226
+ download_path = os . path . dirname ( download_file )
227
+ os .makedirs (download_path , exist_ok = True )
219
228
with open (download_file , 'wb' ) as file :
220
229
if response .getheader ('Content-Encoding' ) == 'gzip' :
221
230
response_data = gzip .decompress (response_data )
@@ -227,12 +236,13 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
227
236
f"SUCCESS -> HTTP: { response_status } - { response_status_message } \n " + \
228
237
f" -> URL: { download_url } \n " + \
229
238
f" -> FILE: { download_file } "
239
+ sc .snapshot_entry_modify (snapshot_entry , "file" , download_file )
230
240
v .write (status_message )
231
241
return True
232
242
else :
233
243
status_message = f"{ status_message } \n " + \
234
244
f"UNEXPECTED -> HTTP: { response_status } - { response_status_message } \n " + \
235
- f" -> URL: { download_url } \n "
245
+ f" -> URL: { download_url } "
236
246
v .write (status_message )
237
247
return True
238
248
# an exception returns False and appends the URL to the failed list
@@ -262,6 +272,10 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
262
272
500 : "Internal Server Error" ,
263
273
503 : "Service Unavailable"
264
274
}
275
+
276
+
277
+
278
+
265
279
def parse_response_code (response_code : int ):
266
280
"""
267
281
Parse the response code of the Wayback Machine and return a human-readable message.
@@ -270,6 +284,10 @@ def parse_response_code(response_code: int):
270
284
return RESPONSE_CODE_DICT [response_code ]
271
285
return "Unknown response code"
272
286
287
+
288
+
289
+
290
+
273
291
def save_csv (csv_path : str , url : str ):
274
292
"""
275
293
Write a CSV file with the list of snapshots.
@@ -285,4 +303,4 @@ def save_csv(csv_path: str, url: str):
285
303
row = csv .DictWriter (file , sc .SNAPSHOT_COLLECTION [0 ].keys ())
286
304
row .writeheader ()
287
305
for snapshot in sc .SNAPSHOT_COLLECTION :
288
- row .writerow (snapshot )
306
+ row .writerow (snapshot )
0 commit comments