Skip to content

Commit cff8c6d

Browse files
committed
Merge branch 'f/csv-output'
1 parent 3f58d6c commit cff8c6d

File tree

7 files changed

+66
-42
lines changed

7 files changed

+66
-42
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,11 @@ Specify the range in years or a specific timestamp either start, end or both. If
6363

6464
#### Additional
6565

66+
- `--csv`: Save a csv file with the list of snapshots inside the output folder.
6667
- `--no-redirect`: Do not follow redirects of snapshots. Archive.org sometimes redirects to a different snapshot for several reasons. Downloading redirects may lead to timestamp-folders which contain some files with a different timestamp. This does not matter if you only want to download the latest version (`-c`).
6768
- `--verbosity [LEVEL]`: Set the verbosity: json (print json response), progress (show progress bar) or standard (default).
6869
- `--retry [RETRY_FAILED]`: Retry failed downloads. You can specify the number of retry attempts as an integer.
69-
- `--worker [AMOUNT]`: The number of worker to use for downloading (simultaneous downloads). Default is 1. Beware: Using too many worker will lead into refused connections from the Wayback Machine. Duration about 1.5 minutes.
70+
- `--worker [AMOUNT]`: The number of workers to use for downloading (simultaneous downloads). Default is 1. A safe spot is about 10 workers. Beware: Using too many workers will lead to refused connections from the Wayback Machine. Duration about 1.5 minutes.
7071

7172
### Examples
7273

pywaybackup/SnapshotCollection.py

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,44 +3,54 @@
33

44
class SnapshotCollection:
55

6-
CDX_LIST = []
76
SNAPSHOT_COLLECTION = []
87
MODE_CURRENT = 0
98

109
@classmethod
11-
def create_list_full(cls, cdxResult):
12-
cls.CDX_LIST = sorted([{"timestamp": snapshot[0], "url": snapshot[1], "status": snapshot[2], "mimetype": snapshot[3], "digest": snapshot[4]} for i, snapshot in enumerate(cdxResult.json()[1:])], key=lambda k: k['timestamp'], reverse=True)
13-
14-
@classmethod
15-
def create_list_current(cls):
16-
cls.MODE_CURRENT = 1
17-
cdxResult_list_filtered = []
18-
url_set = set()
19-
for snapshot in cls.CDX_LIST:
20-
if snapshot["url"] not in url_set:
21-
cdxResult_list_filtered.append(snapshot)
22-
url_set.add(snapshot["url"])
23-
cls.CDX_LIST = cdxResult_list_filtered
10+
def create_list(cls, cdxResult, mode):
11+
"""
12+
Create the snapshot collection list from a cdx result.
2413
14+
- mode `full`: All snapshots are included.
15+
- mode `current`: Only the latest snapshot of each file is included.
16+
"""
17+
# creates a list of dictionaries for each snapshot entry
18+
cls.SNAPSHOT_COLLECTION = sorted([{"timestamp": snapshot[0], "digest": snapshot[1], "mimetype": snapshot[2], "status": snapshot[3], "url": snapshot[4]} for snapshot in cdxResult.json()[1:]], key=lambda k: k['timestamp'], reverse=True)
19+
if mode == "current":
20+
cls.MODE_CURRENT = 1
21+
cdxResult_list_filtered = []
22+
url_set = set()
23+
# filters the list to only include the latest snapshot of each file
24+
for snapshot in cls.SNAPSHOT_COLLECTION:
25+
if snapshot["url"] not in url_set:
26+
cdxResult_list_filtered.append(snapshot)
27+
url_set.add(snapshot["url"])
28+
cls.SNAPSHOT_COLLECTION = cdxResult_list_filtered
29+
# writes the index for each snapshot entry
30+
cls.SNAPSHOT_COLLECTION = [{"id": idx, **entry} for idx, entry in enumerate(cls.SNAPSHOT_COLLECTION)]
31+
2532
@classmethod
2633
def count_list(cls):
27-
return len(cls.CDX_LIST)
34+
return len(cls.SNAPSHOT_COLLECTION)
2835

2936
@classmethod
3037
def create_collection(cls):
31-
for cdx_entry in cls.CDX_LIST:
38+
new_collection = []
39+
for cdx_entry in cls.SNAPSHOT_COLLECTION:
3240
timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
3341
url_archive = f"http://web.archive.org/web/{timestamp}{cls._url_get_filetype(url)}/{url}"
3442
collection_entry = {
35-
"id": len(cls.SNAPSHOT_COLLECTION),
43+
"id": cls.SNAPSHOT_COLLECTION.index(cdx_entry),
3644
"timestamp": timestamp,
3745
"url_archive": url_archive,
3846
"url_origin": url,
39-
"file": False,
40-
"redirect": False,
41-
"response": False
47+
"redirect_url": False,
48+
"redirect_timestamp": False,
49+
"response": False,
50+
"file": False
4251
}
43-
cls.SNAPSHOT_COLLECTION.append(collection_entry)
52+
new_collection.append(collection_entry)
53+
cls.SNAPSHOT_COLLECTION = new_collection
4454

4555
@classmethod
4656
def snapshot_entry_create_output(cls, collection_entry: dict, output: str) -> str:
@@ -60,7 +70,7 @@ def snapshot_entry_create_output(cls, collection_entry: dict, output: str) -> st
6070
download_dir = os.path.join(output, domain, subdir)
6171
else:
6272
download_dir = os.path.join(output, domain, timestamp, subdir)
63-
download_file = os.path.join(download_dir, filename)
73+
download_file = os.path.abspath(os.path.join(download_dir, filename))
6474
return download_file
6575

6676
@classmethod

pywaybackup/Verbosity.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
class Verbosity:
66

7-
snapshots = None
87
mode = None
98
args = None
109
pbar = None
@@ -22,10 +21,10 @@ def open(cls, args: list):
2221
@classmethod
2322
def close(cls):
2423
if cls.mode == "progress":
25-
cls.pbar.close()
24+
if cls.pbar is not None: cls.pbar.close()
2625
if cls.mode == "progress" or cls.mode == "standard":
27-
successed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if snapshot["file"]])
28-
failed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if not snapshot["file"]])
26+
successed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if "file" in snapshot and snapshot["file"]])
27+
failed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if "file" in snapshot and not snapshot["file"]])
2928
print(f"\nFiles downloaded: {successed}")
3029
print(f"Files missing: {failed}")
3130
print("")
@@ -39,7 +38,7 @@ def write(cls, message: str = None, progress: int = None):
3938
print("")
4039
maxval = sc.count_list()
4140
cls.pbar = tqdm.tqdm(total=maxval, desc="Downloading", unit=" snapshot", ascii="░▒█")
42-
elif progress == 1:
41+
elif cls.pbar is not None and progress == 1:
4342
cls.pbar.update(1)
4443
cls.pbar.refresh()
4544
elif cls.mode == "json":

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.1"
1+
__version__ = "0.8.0"

pywaybackup/archive.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def print_list():
7676
if count == 0:
7777
v.write("\nNo snapshots found")
7878
else:
79-
__import__('pprint').pprint(sc.CDX_LIST)
79+
__import__('pprint').pprint(sc.SNAPSHOT_COLLECTION)
8080
v.write(f"\n-----> {count} snapshots listed")
8181

8282

@@ -95,10 +95,9 @@ def query_list(url: str, range: int, start: int, end: int, explicit: bool, mode:
9595
else:
9696
query_range = "&from=" + str(datetime.now().year - range)
9797
cdx_url = f"*.{url}/*" if not explicit else f"{url}"
98-
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{query_range}&fl=timestamp,original,statuscode,mimetype,digest&filter!=statuscode:200"
98+
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{query_range}&fl=timestamp,digest,mimetype,statuscode,original&filter!=statuscode:200"
9999
cdxResult = requests.get(cdxQuery)
100-
sc.create_list_full(cdxResult)
101-
sc.create_list_current() if mode == "current" else None
100+
sc.create_list(cdxResult, mode)
102101
v.write(f"\n-----> {sc.count_list()} snapshots found")
103102
except requests.exceptions.ConnectionError as e:
104103
v.write(f"\n-----> ERROR: could not query snapshots:\n{e}"); exit()
@@ -195,7 +194,6 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
195194
f" -> URL: {location}"
196195
location = urljoin(download_url, location)
197196
download_url = location
198-
sc.snapshot_entry_modify(snapshot_entry, "redirect", True)
199197
sc.snapshot_entry_modify(snapshot_entry, "redirect_timestamp", sc.url_get_timestamp(location))
200198
sc.snapshot_entry_modify(snapshot_entry, "redirect_url", location)
201199
else:
@@ -256,4 +254,18 @@ def parse_response_code(response_code: int):
256254
"""
257255
if response_code in RESPONSE_CODE_DICT:
258256
return RESPONSE_CODE_DICT[response_code]
259-
return "Unknown response code"
257+
return "Unknown response code"
258+
259+
def save_csv(output: str):
    """
    Write a CSV file with the list of snapshots.

    The file is named ``waybackup.csv`` and is placed inside ``output``,
    which is created if it does not exist yet. Nothing is written when the
    snapshot collection is empty.

    The column names are taken from the keys of the first entry of
    ``sc.SNAPSHOT_COLLECTION``.

    :param output: Path of the folder the CSV file is written into.
    """
    import csv
    if sc.count_list() > 0:
        v.write("\nSaving CSV file...")
        # Resolve the folder once so that the directory which is created and
        # the directory the file is opened in are always the same path.
        output_dir = os.path.abspath(output)
        os.makedirs(output_dir, exist_ok=True)
        csv_path = os.path.join(output_dir, "waybackup.csv")
        # newline='' is required by the csv module; without it every row is
        # followed by an empty line on platforms that translate "\n" to "\r\n".
        # An explicit encoding keeps URLs with non-ASCII characters from
        # failing under a narrow locale default codec.
        with open(csv_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=list(sc.SNAPSHOT_COLLECTION[0].keys()))
            writer.writeheader()
            writer.writerows(sc.SNAPSHOT_COLLECTION)

pywaybackup/arguments.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def parse():
2222
optional.add_argument('--end', type=int, help='End timestamp format: YYYYMMDDhhmmss')
2323

2424
special = parser.add_argument_group('special')
25+
special.add_argument('--csv', action='store_true', help='Save a csv file with the list of snapshots inside the output folder')
2526
special.add_argument('--no-redirect', action='store_true', help='Do not follow redirects by archive.org')
2627
special.add_argument('--verbosity', type=str, default="standard", choices=["standard", "progress", "json"], help='Verbosity level')
2728
special.add_argument('--retry', type=int, default=0, metavar="X-TIMES", help='Retry failed downloads (opt tries as int, else infinite)')

pywaybackup/main.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import pywaybackup.archive as archive
2-
import pywaybackup.SnapshotCollection as sc
32
import os
43

54
from pywaybackup.arguments import parse
@@ -9,22 +8,24 @@ def main():
98
args = parse()
109
v.open(args.verbosity)
1110

11+
if args.full:
12+
mode = "full"
1213
if args.current:
1314
mode = "current"
14-
elif args.full:
15-
mode = "full"
15+
16+
if args.output is None:
17+
args.output = os.path.join(os.getcwd(), "waybackup_snapshots")
1618

1719
if args.save:
1820
archive.save_page(args.url)
1921
else:
20-
if args.output is None:
21-
args.output = os.path.join(os.getcwd(), "waybackup_snapshots")
2222
archive.query_list(args.url, args.range, args.start, args.end, args.explicit, mode)
2323
if args.list:
2424
archive.print_list()
2525
else:
26-
archive.download_list(args.output, args.retry, args.no_redirect, args.worker)
27-
#archive.remove_empty_folders(args.output)
26+
archive.download_list(args.output, args.retry, args.no_redirect, args.worker)
27+
if args.csv:
28+
archive.save_csv(args.output)
2829
v.close()
2930

3031
if __name__ == "__main__":

0 commit comments

Comments
 (0)