
Commit a1878f6

Merge branch 'f/snapshot-class'
1 parent ede9ba2 commit a1878f6

5 files changed: +151 -123 lines changed

README.md

Lines changed: 5 additions & 1 deletion
@@ -54,7 +54,7 @@ This script allows you to download content from the Wayback Machine (archive.org
 
 #### Additional
 
-- `--retry [RETRY_FAILED]`: Retry failed downloads. You can specify the number of retry attempts as an integer. If no number is provided, the script will keep retrying indefinitely.
+- `--retry [RETRY_FAILED]`: Retry failed downloads. You can specify the number of retry attempts as an integer.
 - `--worker [AMOUNT]`: The number of worker to use for downloading (simultaneous downloads). Default is 1. Beware: Using too many worker will lead into refused connections from the Wayback Machine. Duration about 1.5 minutes.
 
 ### Examples
@@ -74,6 +74,10 @@ Download all snapshots sorted per timestamp with a specified range and save to a
 List available snapshots per timestamp without downloading:<br>
 `waybackup -u http://example.com -f -l`
 
+## Info
+
+The script automatically follows redirects of snapshots.
+
 ## Contributing
 
 I'm always happy for some feature requests to improve the usability of this script.
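A usage sketch for the revised `--retry` option (not itself part of the README diff): a run such as `waybackup -u http://example.com -f --retry 3` would re-attempt failed snapshot downloads up to three times; `-f` is assumed here to be the full download mode implied by the examples above, with `-l` omitted so the snapshots are actually fetched.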

pywaybackup/SnapshotCollection.py

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+from urllib.parse import urlparse
+import os
+
+class SnapshotCollection:
+
+    CDX_RESULT_JSON = []
+    CDX_RESULT_LIST = []
+    CDX_RESULT_COLLECTION = []
+
+    MODE_CURRENT = 0
+
+    def __init__(self, cdxResult=None, cdxCollection=None):
+        if cdxResult:
+            self.CDX_RESULT_JSON = cdxResult.json()[1:]
+            self.CDX_RESULT_LIST = [{"timestamp": snapshot[0], "url": snapshot[1]} for snapshot in self.CDX_RESULT_JSON]
+            self.CDX_RESULT_LIST = sorted(self.CDX_RESULT_LIST, key=lambda k: k['timestamp'], reverse=True)
+        if cdxCollection:
+            self.CDX_RESULT_COLLECTION = cdxCollection
+
+    def create_current(self):
+        self.MODE_CURRENT = 1
+        self.CDX_RESULT_LIST = sorted(self.CDX_RESULT_LIST, key=lambda k: k['timestamp'], reverse=True)
+        cdxResult_list_filtered = []
+        for snapshot in self.CDX_RESULT_LIST:
+            if snapshot["url"] not in [snapshot["url"] for snapshot in cdxResult_list_filtered]:
+                cdxResult_list_filtered.append(snapshot)
+        self.CDX_RESULT_LIST = cdxResult_list_filtered
+
+    def create_collection(self, output):
+        for snapshot in self.CDX_RESULT_LIST:
+            timestamp, url = snapshot["timestamp"], snapshot["url"]
+            url_type = self.__get_url_filetype(url)
+            download_url = f"http://web.archive.org/web/{timestamp}{url_type}/{url}"
+            domain, subdir, filename = self.__split_url(url)
+            if self.MODE_CURRENT: download_dir = os.path.join(output, domain, subdir)
+            else: download_dir = os.path.join(output, domain, timestamp, subdir)
+            download_file = os.path.join(download_dir, filename)
+            self.CDX_RESULT_COLLECTION.append(
+                {
+                    "index": self.CDX_RESULT_LIST.index(snapshot),
+                    "url": download_url,
+                    "file": str(download_file),
+                    "success": False,
+                    "retry": 0
+                }
+            )
+
+    def count_list(self):
+        return len(self.CDX_RESULT_LIST)
+
+    def count_collection(self):
+        return len(self.CDX_RESULT_COLLECTION)
+
+    def set_value(self, index: int, key: str, value: str):
+        """
+        Set a value in the collection
+
+        Args:
+            index (int): Index of the snapshot
+            key (str): Key of the value
+            value (str): Value to set
+        """
+        self.CDX_RESULT_COLLECTION[index][key] = value
+
+    def __get_url_filetype(self, url):
+        file_extension = url.split(".")[-1]
+        urltype_mapping = {
+            "jpg": "im_",
+            "jpeg": "im_",
+            "png": "im_",
+            "gif": "im_",
+            "svg": "im_",
+            "ico": "im_",
+            "css": "cs_",
+            "js": "js_"
+        }
+        urltype = urltype_mapping.get(file_extension, "id_")
+        return urltype
+
+    def __split_url(self, url):
+        parsed_url = urlparse(url)
+        domain = parsed_url.netloc
+        subdir = parsed_url.path.strip("/")
+        filename = parsed_url.path.split("/")[-1] or "index.html"
+        return domain, subdir, filename
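The new class centralizes what archive.py previously did inline: parsing the CDX JSON rows, de-duplicating URLs for "current" mode, and tracking per-snapshot download state. A minimal usage sketch (not part of the commit; the query URL and output folder are assumptions based on how archive.py wires the class up):

```python
import requests
from pywaybackup.SnapshotCollection import SnapshotCollection

# Assumed CDX query, mirroring the style of the one in archive.py.
resp = requests.get(
    "https://web.archive.org/cdx/search/cdx"
    "?output=json&url=*.example.com/*&fl=timestamp,original"
)

snapshots = SnapshotCollection(resp)       # strips the field-list row, sorts by timestamp
snapshots.create_current()                 # keep only the newest snapshot of each URL
snapshots.create_collection("waybackup_snapshots")  # build the download entries

print(snapshots.count_list(), "unique URLs")
print(snapshots.count_collection(), "download entries")
print(snapshots.CDX_RESULT_COLLECTION[0]["url"])  # composed web.archive.org download URL
snapshots.set_value(0, "success", True)           # what archive.py does after a successful download
```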

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.5.1"
+__version__ = "0.5.2"

pywaybackup/archive.py

Lines changed: 55 additions & 114 deletions
@@ -8,6 +8,8 @@
 from urllib.parse import urljoin
 from datetime import datetime, timezone
 
+import pywaybackup.SnapshotCollection as sc
+
 
 
 
@@ -67,13 +69,13 @@ def save_page(url: str):
 
 
 
-def print_result(result_list):
+def print_result(snapshots):
     print("")
-    if not result_list:
+    if not snapshots:
         print("No snapshots found")
     else:
-        __import__('pprint').pprint(result_list)
-        print(f"\n-----> {len(result_list)} snapshots listed")
+        __import__('pprint').pprint(snapshots.CDX_RESULT_LIST)
+        print(f"\n-----> {snapshots.count_list()} snapshots listed")
 
 
 
@@ -91,17 +93,10 @@ def query_list(url: str, range: int, mode: str):
         cdxQuery = f"https://web.archive.org/cdx/search/cdx?output=json&url=*.{url}/*{range}&fl=timestamp,original&filter=!statuscode:200"
         cdxResult = requests.get(cdxQuery)
         if cdxResult.status_code != 200: print(f"\n-----> ERROR: could not query snapshots, status code: {cdxResult.status_code}"); exit()
-        cdxResult_json = cdxResult.json()[1:] # first line is fieldlist, so remove it [timestamp, original]
-        cdxResult_list = [{"timestamp": snapshot[0], "url": snapshot[1]} for snapshot in cdxResult_json]
-        if mode == "current":
-            cdxResult_list = sorted(cdxResult_list, key=lambda k: k['timestamp'], reverse=True)
-            cdxResult_list_filtered = []
-            for snapshot in cdxResult_list:
-                if snapshot["url"] not in [snapshot["url"] for snapshot in cdxResult_list_filtered]:
-                    cdxResult_list_filtered.append(snapshot)
-            cdxResult_list = cdxResult_list_filtered
-        print(f"\n-----> {len(cdxResult_list)} snapshots found")
-        return cdxResult_list
+        snapshots = sc.SnapshotCollection(cdxResult)
+        if mode == "current": snapshots.create_current()
+        print(f"\n-----> {snapshots.count_list()} snapshots found")
+        return snapshots
     except requests.exceptions.ConnectionError as e:
         print(f"\n-----> ERROR: could not query snapshots:\n{e}"); exit()
 
@@ -110,38 +105,6 @@
 
 
 
-def split_url(url):
-    """
-    Split url into domain, subdir and file.
-    If no file is present, the filename will be index.html
-    """
-    domain = url.split("//")[-1].split("/")[0]
-    subdir = "/".join(url.split("//")[-1].split("/")[1:-1])
-    filename = url.split("/")[-1] or "index.html"
-    return domain, subdir, filename
-
-def determine_url_filetype(url):
-    """
-    Determine filetype of the archive-url by looking at the file extension.
-    """
-    image = ["jpg", "jpeg", "png", "gif", "svg", "ico"]
-    css = ["css"]
-    js = ["js"]
-    file_extension = url.split(".")[-1]
-    if file_extension in image:
-        urltype = "im_"
-    elif file_extension in css:
-        urltype = "cs_"
-    elif file_extension in js:
-        urltype = "js_"
-    else:
-        urltype = "id_"
-    return urltype
-
-
-
-
-
 def remove_empty_folders(path, remove_root=True):
     print("")
     print("Removing empty output folders...")
@@ -175,130 +138,108 @@ def remove_empty_folders(path, remove_root=True):
 
 
 # example download: http://web.archive.org/web/20190815104545id_/https://www.google.com/
-# example url: https://www.google.com/
-# example timestamp: 20190815104545
-def download_prepare_list(cdxResult_list, output, retry, worker, mode):
+def download_prepare_list(snapshots, output, retry, worker):
     """
     Download a list of urls in format: [{"timestamp": "20190815104545", "url": "https://www.google.com/"}]
     """
     print("\nDownloading latest snapshots of each file...")
-    download_list = []
-    for snapshot in cdxResult_list:
-        timestamp, url = snapshot["timestamp"], snapshot["url"]
-        type = determine_url_filetype(url)
-        download_url = f"http://web.archive.org/web/{timestamp}{type}/{url}"
-        domain, subdir, filename = split_url(url)
-        if mode == "current": download_dir = os.path.join(output, domain, subdir)
-        if mode == "full": download_dir = os.path.join(output, domain, timestamp, subdir)
-        download_list.append({"url": download_url, "filename": filename, "filepath": download_dir})
+    snapshots.create_collection(output)
+    download_list = snapshots.CDX_RESULT_COLLECTION
     if worker > 1:
         print(f"\n-----> Simultaneous downloads: {worker}")
-        batch_size = len(download_list) // worker + 1
+        batch_size = snapshots.count_collection() // worker + 1
     else:
-        batch_size = len(download_list)
+        batch_size = snapshots.count_collection()
     batch_list = [download_list[i:i + batch_size] for i in range(0, len(download_list), batch_size)]
     threads = []
     worker = 0
     for batch in batch_list:
         worker += 1
-        thread = threading.Thread(target=download_url_list, args=(batch, worker, retry))
+        thread = threading.Thread(target=download_url_list, args=(snapshots, batch, worker, retry))
         threads.append(thread)
         thread.start()
     for thread in threads:
         thread.join()
+    failed_urls = len([url for url in snapshots.CDX_RESULT_COLLECTION if url["success"] == False])
+    if failed_urls: print(f"\n-----> Failed downloads: {len(failed_urls)}")
 
-def download_url_list(url_list, worker, retry):
+def download_url_list(snapshots, url_list, worker, retry, attempt=1, connection=None):
+    max_attempt = retry
     failed_urls = []
-    connection = http.client.HTTPSConnection("web.archive.org")
-    for url_entry in url_list:
-        status = f"\n-----> Snapshot [{url_list.index(url_entry) + 1}/{len(url_list)}] Worker: {worker}"
-        download_url, download_filename, download_filepath = url_entry["url"], url_entry["filename"], url_entry["filepath"]
-        download_status=download_url_entry(download_url, download_filename, download_filepath, connection, status)
-        if download_status != True: failed_urls.append({"url": download_url, "filename": download_filename, "filepath": download_filepath})
-    if retry:
-        download_retry(failed_urls, retry, connection)
-    connection.close()
-
-def download_retry(failed_urls, retry, connection):
-    """
-    Retry failed downloads.
-    failed_urls: [{"url": download_url, "filename": download_filename, "filepath": download_filepath}]
-    retry: int or None
-    """
-    attempt = 1
-    max_attempt = retry if retry is not True else "no-limit"
-    while failed_urls and (attempt <= retry or retry is True):
-        print("\n-----> Retrying...")
-        retry_urls = []
-        for failed_entry in failed_urls:
-            status = f"\n-----> RETRY attempt: [{attempt}/{max_attempt}] Snapshot [{failed_urls.index(failed_entry) + 1}/{len(failed_urls)}]"
-            download_url, download_filename, download_filepath = failed_entry["url"], failed_entry["filename"], failed_entry["filepath"]
-            retry_status=download_url_entry(download_url, download_filename, download_filepath, connection, status)
-            if retry_status != bool(1):
-                retry_urls.append({"url": download_url, "filename": download_filename, "filepath": download_filepath})
-        failed_urls = retry_urls
-        print(f"\n-----> Fail downloads: {len(failed_urls)}")
-        if retry: attempt += 1
-
-def download_url_entry(url, filename, filepath, connection, status_message):
+    if not connection:
+        connection = http.client.HTTPSConnection("web.archive.org")
+    if attempt > max_attempt:
+        connection.close()
+        print(f"\n-----> Worker: {worker} - Failed downloads: {len(url_list)}")
+        return
+    else:
+        for url_entry in url_list:
+            status = f"\n-----> Attempt: [{attempt}/{max_attempt}] Snapshot [{url_list.index(url_entry) + 1}/{len(url_list)}] Worker: {worker}"
+            download_status=download_url_entry(url_entry, connection, status)
+            if download_status != True: failed_urls.append(url_entry); url_entry["retry"] += 1
+            if download_status == True: snapshots.set_value(url_entry["index"], "success", True)
+        attempt += 1
+        if failed_urls: download_url_list(snapshots, failed_urls, worker, retry, attempt, connection)
+
+def download_url_entry(download_entry, connection, status_message):
     """
     Download a single URL and save it to the specified filepath.
 
     Args:
-        url (str): The URL to download.
-        filename (str): The name of the file to save.
-        filepath (str): The path where the file will be saved.
+        download_url (str): The URL to download.
+        download_file (str): The name of the file to save.
         connection (http.client.HTTPConnection): The HTTP connection object.
-        status (str): The current status message.
+        status_message (str): The current status message.
 
     Returns:
         bool: True if the download is successful, False otherwise.
     """
-    output = os.path.join(filepath, filename)
+    download_url = download_entry["url"]
+    download_file = download_entry["file"]
     max_retries = 2
     sleep_time = 45
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
     for i in range(max_retries):
         try:
-            connection.request("GET", url, headers=headers)
+            connection.request("GET", download_url, headers=headers)
             response = connection.getresponse()
             response_data = response.read()
             response_status = response.status
             if response_status == 302:
                 status_message = f"{status_message}\n" + \
                     f"REDIRECT -> HTTP: {response.status}"
                 while response_status == 302:
-                    connection.request("GET", url, headers=headers)
+                    connection.request("GET", download_url, headers=headers)
                     response = connection.getresponse()
                     response_data = response.read()
                     response_status = response.status
                     location = response.getheader("Location")
                     if location:
                         status_message = f"{status_message}\n" + \
                             f" -> URL: {location}"
-                        location = urljoin(url, location)
-                        url = location
+                        location = urljoin(download_url, location)
+                        download_url = location
                     else:
                         break
             if response_status != 404:
-                os.makedirs(filepath, exist_ok=True)
-                with open(output, 'wb') as file:
+                os.makedirs(os.path.dirname(download_file), exist_ok=True)
+                with open(download_file, 'wb') as file:
                     file.write(response_data)
             if response_status == 200:
                 status_message = f"{status_message}\n" + \
                     f"SUCCESS -> HTTP: {response.status}\n" + \
-                    f" -> URL: {url}\n" + \
-                    f" -> FILE: {output}"
-                print(status_message)
+                    f" -> URL: {download_url}\n" + \
+                    f" -> FILE: {download_file}"
             elif response_status == 404:
                 status_message = f"{status_message}\n" + \
                     f"NOT FOUND -> HTTP: {response.status}\n" + \
-                    f" -> URL: {url}"
+                    f" -> URL: {download_url}"
             else:
                 status_message = f"{status_message}\n" + \
                     f"UNEXPECTED -> HTTP: {response.status}\n" + \
-                    f" -> URL: {url}\n" + \
-                    f" -> FILE: {output}"
+                    f" -> URL: {download_url}\n" + \
+                    f" -> FILE: {download_file}"
+            print(status_message)
             return True
         except ConnectionRefusedError as e:
             status_message = f"{status_message}\n" + \
@@ -308,11 +249,11 @@ def download_url_entry(url, filename, filepath, connection, status_message):
             time.sleep(sleep_time)
         except http.client.HTTPException as e:
             status_message = f"{status_message}\n" + \
-                f"EXCEPTION -> ({i+1}/{max_retries}), append to failed_urls: {url}\n" + \
+                f"EXCEPTION -> ({i+1}/{max_retries}), append to failed_urls: {download_url}\n" + \
                 f" -> {e}"
             print(status_message)
             return False
-    print(f"FAILED -> download, append to failed_urls: {url}")
+    print(f"FAILED -> download, append to failed_urls: {download_url}")
     return False
 
 
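The separate download_retry function is gone; download_url_list now calls itself with only the entries that failed, reusing the open connection and stopping once attempt exceeds the configured retry count, while successes are flagged on the shared SnapshotCollection via set_value. A toy sketch of that recursion pattern with the HTTP work stubbed out (names and the fake downloader are illustrative, not from the commit):

```python
import random

def fake_download(entry):
    # Stand-in for download_url_entry(): succeeds at random to exercise retries.
    return random.random() < 0.5

def download_url_list(entries, retry, attempt=1):
    # Recurse on the failed subset until everything succeeded or the
    # attempt counter exceeds the retry limit, as in the new archive.py.
    if attempt > retry:
        print(f"Failed downloads: {len(entries)}")
        return
    failed = []
    for entry in entries:
        if fake_download(entry):
            entry["success"] = True
        else:
            entry["retry"] += 1
            failed.append(entry)
    if failed:
        download_url_list(failed, retry, attempt + 1)

entries = [{"url": f"https://example.com/{i}", "success": False, "retry": 0} for i in range(5)]
download_url_list(entries, retry=3)
print([e["success"] for e in entries])
```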