Skip to content

Commit 532e136

Browse files
author
bitdruid@vbox
committed
Merge branch 'f/save-pages-to-archive'
1 parent b9b964d commit 532e136

File tree

5 files changed

+86
-17
lines changed

5 files changed

+86
-17
lines changed

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ Internet-archive is a nice source for several OSINT-information. This script is
2222
### Manual
2323

2424
1. Clone the repository <br>
25-
```git clone https://github.com/bitdruid/waybackup.git```
25+
```git clone https://github.com/bitdruid/python-wayback-machine-downloader.git```
2626
2. Install <br>
2727
```pip install .```
2828
- in a virtual env, or use `--break-system-packages`
@@ -42,8 +42,9 @@ This script allows you to download content from the Wayback Machine (archive.org
4242

4343
#### Mode Selection (Choose One)
4444

45-
- `-c`, `--current`: Download the latest version of each file snapshot. This option is mutually exclusive with `-f/--full`.
46-
- `-f`, `--full`: Download snapshots of all timestamps. This option is mutually exclusive with `-c/--current`.
45+
- `-c`, `--current`: Download the latest version of each file snapshot.
46+
- `-f`, `--full`: Download snapshots of all timestamps.
47+
- `-s`, `--save`: Save a page to the Wayback Machine. (beta)
4748

4849
#### Optional Arguments
4950

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.2"
1+
__version__ = "0.5.0"

pywaybackup/archive.py

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,71 @@
11
#import threading
22
import requests
3-
import datetime
43
import os
54
import magic
65
import threading
76
import time
87
import http.client
98
from urllib.parse import urljoin
9+
from datetime import datetime, timezone
10+
11+
12+
13+
14+
# GET: stores the page in the Wayback Machine and responds with a redirect to the snapshot
15+
# POST: stores the page in the Wayback Machine and responds with the Wayback Machine status page
16+
# tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
17+
# tag_result_timeout = '<p>The same snapshot had been made %s minutes ago. You can make new capture of this URL after 1 hour.</p>'
18+
# tag_result_success = ' A snapshot was captured. Visit page: <a href="%s">%s</a>'
19+
def save_page(url: str):
    """
    Save a webpage to the Wayback Machine and report the outcome on the console.

    Sends a GET request to ``https://web.archive.org/save/<url>`` and inspects
    the response status:

    * 302 - the ``Location`` header points at a snapshot. The snapshot
      timestamp embedded in that URL (``/web/<YYYYmmddHHMMSS>/``) is compared
      with the current UTC time to tell a fresh capture from an existing one.
    * 404 - reported as "not found".
    * anything else - reported as unexpected.

    Args:
        url (str): The URL of the webpage to be saved.

    Returns:
        None: The function does not return any value. It only prints messages to the console.
    """
    print("\nSaving page to the Wayback Machine...")
    connection = http.client.HTTPSConnection("web.archive.org")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    # try/finally guarantees the socket is released even if the request fails.
    try:
        connection.request("GET", f"https://web.archive.org/save/{url}", headers=headers)
        print("\n-----> Request sent")
        response = connection.getresponse()
        response_status = response.status

        if response_status == 302:
            location = response.getheader("Location")
            print("\n-----> Response: 302 (redirect to snapshot)")

            # The redirect target is expected to look like
            # ".../web/<YYYYmmddHHMMSS>/<url>"; a missing or differently shaped
            # header must not crash the whole run.
            try:
                snapshot_dt = datetime.strptime(
                    location.split('/web/')[1].split('/')[0], '%Y%m%d%H%M%S'
                ).replace(tzinfo=timezone.utc)  # treated as UTC, matching the UTC "now" below
            except (AttributeError, IndexError, ValueError):
                snapshot_dt = None

            if snapshot_dt is None:
                print("\n-----> Could not read snapshot timestamp from redirect")
            else:
                # Truncate to whole seconds so the printed request timestamp and
                # the computed difference agree.
                current_dt = datetime.now(timezone.utc).replace(microsecond=0)
                snapshot_timestamp = snapshot_dt.strftime('%Y-%m-%d %H:%M:%S')
                current_timestamp = current_dt.strftime('%Y-%m-%d %H:%M:%S')

                # total_seconds() keeps the days component and the sign;
                # timedelta.seconds alone would silently drop both.
                timestamp_difference = int(round((current_dt - snapshot_dt).total_seconds() / 60))

                if timestamp_difference < 1:
                    print("\n-----> New snapshot created")
                else:
                    # Covers a difference of exactly 1 minute as well, which
                    # previously produced no status message at all.
                    wait_minutes = max(0, 60 - timestamp_difference)
                    print(f"\n-----> Snapshot already exists. (1 hour limit) - wait for {wait_minutes} minutes")
                    print(f"TIMESTAMP SNAPSHOT: {snapshot_timestamp}")
                    print(f"TIMESTAMP REQUEST : {current_timestamp}")
                    print(f"\nLAST SNAPSHOT BACK: {timestamp_difference} minutes")

            print(f"\nURL: {location}")

        elif response_status == 404:
            print("\n-----> Response: 404 (not found)")
            print(f"\nFAILED -> URL: {url}")
        else:
            print("\n-----> Response: unexpected")
            print(f"\nFAILED -> URL: {url}")
    finally:
        connection.close()
65+
66+
67+
68+
1069

1170
def print_result(result_list):
1271
print("")
@@ -16,11 +75,15 @@ def print_result(result_list):
1675
__import__('pprint').pprint(result_list)
1776
print(f"\n-----> {len(result_list)} snapshots listed")
1877

78+
79+
80+
81+
1982
# create filelist
2083
def query_list(url: str, range: int, mode: str):
2184
print("\nQuerying snapshots...")
2285
if range:
23-
range = datetime.datetime.now().year - range
86+
range = datetime.now().year - range
2487
range = "&from=" + str(range)
2588
else:
2689
range = ""

pywaybackup/main.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ def main():
1212
exclusive_required = required.add_mutually_exclusive_group(required=True)
1313
exclusive_required.add_argument('-c', '--current', action='store_true', help='Download the latest version of each file snapshot (opt range in y)')
1414
exclusive_required.add_argument('-f', '--full', action='store_true', help='Download snapshots of all timestamps (opt range in y)')
15+
exclusive_required.add_argument('-s', '--save', action='store_true', help='Save a page to the wayback machine')
1516
optional = parser.add_argument_group('optional')
1617
optional.add_argument('-l', '--list', action='store_true', help='Only print snapshots (opt range in y)')
1718
optional.add_argument('-r', '--range', type=int, help='Range in years to search')
@@ -27,16 +28,20 @@ def main():
2728
if args.full:
2829
mode = "full"
2930

30-
if args.output is None:
31-
args.output = os.path.join(os.getcwd(), "waybackup_snapshots")
32-
cdxResult_list = archive.query_list(args.url, args.range, mode)
33-
if args.list:
34-
archive.print_result(cdxResult_list)
35-
if not args.list:
36-
archive.download_prepare_list(cdxResult_list, args.output, args.retry, args.worker, mode)
37-
archive.remove_empty_folders(args.output)
38-
# if args.detect_filetype:
39-
# archive.detect_filetype(args.output)
31+
if args.save:
32+
archive.save_page(args.url)
33+
else:
34+
if args.output is None:
35+
args.output = os.path.join(os.getcwd(), "waybackup_snapshots")
36+
cdxResult_list = archive.query_list(args.url, args.range, mode)
37+
if args.list:
38+
archive.print_result(cdxResult_list)
39+
if not args.list and not args.save:
40+
archive.download_prepare_list(cdxResult_list, args.output, args.retry, args.worker, mode)
41+
archive.remove_empty_folders(args.output)
42+
# if args.detect_filetype:
43+
# archive.detect_filetype(args.output)
44+
print("")
4045

4146
if __name__ == "__main__":
4247
main()

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,5 @@ def parse_requirements(filename):
3131
long_description_content_type='text/markdown',
3232
license='MIT',
3333
keywords='wayback machine internet archive',
34-
url='https://github.com/bitdruid/waybackup',
34+
url='https://github.com/bitdruid/python-wayback-machine-downloader',
3535
)

0 commit comments

Comments
 (0)