1
1
#import threading
2
2
import requests
3
- import datetime
4
3
import os
5
4
import magic
6
5
import threading
7
6
import time
8
7
import http .client
9
8
from urllib .parse import urljoin
9
+ from datetime import datetime , timezone
10
+
11
+
12
+
13
+
14
+ # GET: stores the page in the Wayback Machine and responds with a redirect to the snapshot
15
+ # POST: stores the page in the Wayback Machine and responds with the Wayback Machine status page
16
+ # tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
17
+ # tag_result_timeout = '<p>The same snapshot had been made %s minutes ago. You can make new capture of this URL after 1 hour.</p>'
18
+ # tag_result_success = ' A snapshot was captured. Visit page: <a href="%s">%s</a>'
19
def _parse_snapshot_time(location):
    """Return the capture time embedded in a snapshot redirect URL, or None.

    Looks for the 14-digit ``YYYYmmddHHMMSS`` stamp that follows ``/web/`` in
    *location*. The stamp is tagged as UTC because it is compared against the
    current UTC time. Returns None when *location* is missing, has no
    ``/web/`` segment, or the stamp does not parse.
    """
    if not location or '/web/' not in location:
        return None
    stamp = location.split('/web/')[1].split('/')[0]
    try:
        return datetime.strptime(stamp, '%Y%m%d%H%M%S').replace(tzinfo=timezone.utc)
    except ValueError:
        return None


def save_page(url: str):
    """
    Ask the Wayback Machine to save a webpage and report the outcome.

    Sends ``GET https://web.archive.org/save/<url>`` and interprets the reply:
    a 302 is treated as a redirect to a snapshot, whose age (in minutes) is
    derived from the timestamp in the ``Location`` header; 404 and any other
    status are reported as failures.

    Args:
        url (str): The URL of the webpage to be saved.

    Returns:
        None: The function only prints messages to the console.

    Raises:
        OSError, http.client.HTTPException: connection or protocol errors are
            not caught here; the connection is still closed before they
            propagate.
    """
    print("\n Saving page to the Wayback Machine...")
    connection = http.client.HTTPSConnection("web.archive.org")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    try:
        # No trailing whitespace in the request target: http.client rejects
        # targets containing spaces/control characters with InvalidURL.
        connection.request("GET", f"https://web.archive.org/save/{url}", headers=headers)
        print("\n -----> Request sent")
        response = connection.getresponse()
        response_status = response.status

        if response_status == 302:
            location = response.getheader("Location")
            print("\n -----> Response: 302 (redirect to snapshot)")
            snapshot_time = _parse_snapshot_time(location)

            if snapshot_time is None:
                # Redirect without a usable /web/<timestamp>/ segment: report
                # it instead of crashing on None / a bad split / bad format.
                print("\n -----> Could not read a snapshot timestamp from the redirect")
            else:
                # Drop microseconds so the printed and the compared request
                # time are the same second-resolution value.
                request_time = datetime.now(timezone.utc).replace(microsecond=0)
                # total_seconds() keeps whole days (``.seconds`` discards them
                # and wraps negatives to ~24h); clamp small clock skew to 0.
                age_seconds = (request_time - snapshot_time).total_seconds()
                timestamp_difference = max(0, round(age_seconds / 60))

                if timestamp_difference < 1:
                    print("\n -----> New snapshot created")
                else:
                    # Covers exactly 1 minute too, which previously matched
                    # neither branch. Remaining wait never goes below zero.
                    remaining = max(0, 60 - timestamp_difference)
                    print(f"\n -----> Snapshot already exists. (1 hour limit) - wait for {remaining} minutes")
                    print(f"TIMESTAMP SNAPSHOT: {snapshot_time.strftime('%Y-%m-%d %H:%M:%S')} ")
                    print(f"TIMESTAMP REQUEST : {request_time.strftime('%Y-%m-%d %H:%M:%S')} ")
                    print(f"\n LAST SNAPSHOT BACK: {timestamp_difference} minutes")

            print(f"\n URL: {location} ")

        elif response_status == 404:
            print("\n -----> Response: 404 (not found)")
            print(f"\n FAILED -> URL: {url} ")
        else:
            print("\n -----> Response: unexpected")
            print(f"\n FAILED -> URL: {url} ")
    finally:
        # Always release the socket, including when request/getresponse raise.
        connection.close()
65
+
66
+
67
+
68
+
10
69
11
70
def print_result (result_list ):
12
71
print ("" )
@@ -16,11 +75,15 @@ def print_result(result_list):
16
75
__import__ ('pprint' ).pprint (result_list )
17
76
print (f"\n -----> { len (result_list )} snapshots listed" )
18
77
78
+
79
+
80
+
81
+
19
82
# create filelist
20
83
def query_list (url : str , range : int , mode : str ):
21
84
print ("\n Querying snapshots..." )
22
85
if range :
23
- range = datetime .datetime . now ().year - range
86
+ range = datetime .now ().year - range
24
87
range = "&from=" + str (range )
25
88
else :
26
89
range = ""
0 commit comments