Skip to content

Commit 3f58d6c

Browse files
committed
Merge branch 'f/trim-json'
1 parent 4f12a31 commit 3f58d6c

File tree

6 files changed

+30
-38
lines changed

6 files changed

+30
-38
lines changed

README.md

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,7 @@ Internet-archive is a nice source for several OSINT-information. This script is
3131

3232
This script allows you to download content from the Wayback Machine (archive.org). You can use it to download either the latest version or all versions of web page snapshots within a specified range.
3333

34-
## Info
35-
36-
- The script will only request status code 200 snapshots (for now) - but this can differ from the status code when downloading the file.
34+
<!-- ## Info -->
3735

3836
### Arguments
3937

@@ -65,8 +63,7 @@ Specify the range in years or a specific timestamp either start, end or both. If
6563

6664
#### Additional
6765

68-
- `--redirect`: Follow redirects of snapshots. Default is False. If a source has not statuscode 200, archive.org will redirect to the closest snapshot. So when setting this to `true`, parts of a timestamp-folder may not truly belong to the given timestamp.
69-
<!-- - `--harvest`: The downloaded files are scanned for locations on the same domain. These locations (mostly resources) are then tried to be accessed within the same timestamp. Setting this to `true` may result in identical files in different timestamps but you may get a more complete snapshot of the website. -->
66+
- `--no-redirect`: Do not follow redirects of snapshots. Archive.org sometimes redirects to a different snapshot for several reasons. Downloading redirects may lead to timestamp folders that contain some files with a different timestamp. This does not matter if you only want to download the latest version (`-c`).
7067
- `--verbosity [LEVEL]`: Set the verbosity: json (print the JSON response), progress (show a progress bar), or standard (default).
7168
- `--retry [RETRY_FAILED]`: Retry failed downloads. You can specify the number of retry attempts as an integer.
7269
- `--worker [AMOUNT]`: The number of workers to use for downloading (simultaneous downloads). Default is 1. Beware: using too many workers will lead to refused connections from the Wayback Machine. Duration is about 1.5 minutes per worker batch.

pywaybackup/SnapshotCollection.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class SnapshotCollection:
99

1010
@classmethod
1111
def create_list_full(cls, cdxResult):
12-
cls.CDX_LIST = sorted([{"timestamp": snapshot[0], "url": snapshot[1]} for i, snapshot in enumerate(cdxResult.json()[1:])], key=lambda k: k['timestamp'], reverse=True)
12+
cls.CDX_LIST = sorted([{"timestamp": snapshot[0], "url": snapshot[1], "status": snapshot[2], "mimetype": snapshot[3], "digest": snapshot[4]} for i, snapshot in enumerate(cdxResult.json()[1:])], key=lambda k: k['timestamp'], reverse=True)
1313

1414
@classmethod
1515
def create_list_current(cls):
@@ -38,9 +38,7 @@ def create_collection(cls):
3838
"url_origin": url,
3939
"file": False,
4040
"redirect": False,
41-
"http_code": False,
42-
"http_message": False,
43-
"retry": False
41+
"response": False
4442
}
4543
cls.SNAPSHOT_COLLECTION.append(collection_entry)
4644

@@ -80,8 +78,8 @@ def url_get_timestamp(cls, url):
8078
"""
8179
Extract the timestamp from a wayback machine URL.
8280
"""
83-
import re
84-
timestamp = re.search(r'web.archive.org/web/(\d+)/', url).group(1)
81+
timestamp = url.split("web.archive.org/web/")[1].split("/")[0]
82+
timestamp = ''.join([char for char in timestamp if char.isdigit()])
8583
return timestamp
8684

8785
@classmethod

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.0"
1+
__version__ = "0.7.1"

pywaybackup/archive.py

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def query_list(url: str, range: int, start: int, end: int, explicit: bool, mode:
9595
else:
9696
query_range = "&from=" + str(datetime.now().year - range)
9797
cdx_url = f"*.{url}/*" if not explicit else f"{url}"
98-
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{query_range}&fl=timestamp,original,statuscode&filter!=statuscode:200"
98+
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{query_range}&fl=timestamp,original,statuscode,mimetype,digest&filter!=statuscode:200"
9999
cdxResult = requests.get(cdxQuery)
100100
sc.create_list_full(cdxResult)
101101
sc.create_list_current() if mode == "current" else None
@@ -108,7 +108,7 @@ def query_list(url: str, range: int, start: int, end: int, explicit: bool, mode:
108108

109109

110110
# example download: http://web.archive.org/web/20190815104545id_/https://www.google.com/
111-
def download_list(output, retry, redirect, worker):
111+
def download_list(output, retry, no_redirect, worker):
112112
"""
113113
Download a list of urls in format: [{"timestamp": "20190815104545", "url": "https://www.google.com/"}]
114114
"""
@@ -127,13 +127,13 @@ def download_list(output, retry, redirect, worker):
127127
worker = 0
128128
for batch in batch_list:
129129
worker += 1
130-
thread = threading.Thread(target=download_loop, args=(batch, output, worker, retry, redirect))
130+
thread = threading.Thread(target=download_loop, args=(batch, output, worker, retry, no_redirect))
131131
threads.append(thread)
132132
thread.start()
133133
for thread in threads:
134134
thread.join()
135135

136-
def download_loop(snapshot_batch, output, worker, retry, redirect, attempt=1, connection=None):
136+
def download_loop(snapshot_batch, output, worker, retry, no_redirect, attempt=1, connection=None):
137137
"""
138138
Download a list of URLs in a recursive loop. If a download fails, the function will retry the download.
139139
The "snapshot_collection" dictionary will be updated with the download status and file information.
@@ -149,9 +149,8 @@ def download_loop(snapshot_batch, output, worker, retry, redirect, attempt=1, co
149149
return
150150
for snapshot in snapshot_batch:
151151
status = f"\n-----> Attempt: [{attempt}/{max_attempt}] Snapshot [{snapshot_batch.index(snapshot)+1}/{len(snapshot_batch)}] - Worker: {worker}"
152-
download_status = download(output, snapshot, connection, status, redirect)
152+
download_status = download(output, snapshot, connection, status, no_redirect)
153153
if not download_status:
154-
if retry > 0: sc.snapshot_entry_modify(snapshot, "retry", attempt)
155154
failed_urls.append(snapshot)
156155
if download_status:
157156
v.write(progress=1)
@@ -160,9 +159,9 @@ def download_loop(snapshot_batch, output, worker, retry, redirect, attempt=1, co
160159
if not attempt > max_attempt:
161160
v.write(f"\n-----> Worker: {worker} - Retry Timeout: 10 seconds")
162161
time.sleep(15)
163-
download_loop(failed_urls, output, worker, retry, redirect, attempt, connection)
162+
download_loop(failed_urls, output, worker, retry, no_redirect, attempt, connection)
164163

165-
def download(output, snapshot_entry, connection, status_message, redirect=False):
164+
def download(output, snapshot_entry, connection, status_message, no_redirect=False):
166165
"""
167166
Download a single URL and save it to the specified filepath.
168167
If there is a redirect, the function will follow the redirect and update the download URL.
@@ -180,9 +179,8 @@ def download(output, snapshot_entry, connection, status_message, redirect=False)
180179
response_data = response.read()
181180
response_status = response.status
182181
response_status_message = parse_response_code(response_status)
183-
sc.snapshot_entry_modify(snapshot_entry, "http_code", response_status)
184-
sc.snapshot_entry_modify(snapshot_entry, "http_message", response_status_message)
185-
if redirect:
182+
sc.snapshot_entry_modify(snapshot_entry, "response", response_status)
183+
if not no_redirect:
186184
if response_status == 302:
187185
status_message = f"{status_message}\n" + \
188186
f"REDIRECT -> HTTP: {response.status}"
@@ -242,20 +240,20 @@ def download(output, snapshot_entry, connection, status_message, redirect=False)
242240
v.write(f"FAILED -> download, append to failed_urls: {download_url}")
243241
return False
244242

243+
RESPONSE_CODE_DICT = {
244+
200: "OK",
245+
301: "Moved Permanently",
246+
302: "Found (redirect)",
247+
400: "Bad Request",
248+
403: "Forbidden",
249+
404: "Not Found",
250+
500: "Internal Server Error",
251+
503: "Service Unavailable"
252+
}
245253
def parse_response_code(response_code: int):
246254
"""
247255
Parse the response code of the Wayback Machine and return a human-readable message.
248256
"""
249-
response_code_dict = {
250-
200: "OK",
251-
301: "Moved Permanently",
252-
302: "Found (redirect)",
253-
400: "Bad Request",
254-
403: "Forbidden",
255-
404: "Not Found",
256-
500: "Internal Server Error",
257-
503: "Service Unavailable"
258-
}
259-
if response_code in response_code_dict:
260-
return response_code_dict[response_code]
257+
if response_code in RESPONSE_CODE_DICT:
258+
return RESPONSE_CODE_DICT[response_code]
261259
return "Unknown response code"

pywaybackup/arguments.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@ def parse():
2222
optional.add_argument('--end', type=int, help='End timestamp format: YYYYMMDDhhmmss')
2323

2424
special = parser.add_argument_group('special')
25-
special.add_argument('--redirect', action='store_true', help='Follow redirects by archive.org')
26-
# special.add_argument('--harvest', action='store_true', help='Harvest location tags from snapshots and try to get as much as possible')
25+
special.add_argument('--no-redirect', action='store_true', help='Do not follow redirects by archive.org')
2726
special.add_argument('--verbosity', type=str, default="standard", choices=["standard", "progress", "json"], help='Verbosity level')
2827
special.add_argument('--retry', type=int, default=0, metavar="X-TIMES", help='Retry failed downloads (opt tries as int, else infinite)')
2928
special.add_argument('--worker', type=int, default=1, metavar="AMOUNT", help='Number of worker (simultaneous downloads)')

pywaybackup/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def main():
2323
if args.list:
2424
archive.print_list()
2525
else:
26-
archive.download_list(args.output, args.retry, args.redirect, args.worker)
26+
archive.download_list(args.output, args.retry, args.no_redirect, args.worker)
2727
#archive.remove_empty_folders(args.output)
2828
v.close()
2929

0 commit comments

Comments
 (0)