Skip to content

Commit 9b422bb

Browse files
committed
Merge branch 'f/progress-bar'
1 parent a1878f6 commit 9b422bb

File tree

6 files changed

+188
-124
lines changed

6 files changed

+188
-124
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ This script allows you to download content from the Wayback Machine (archive.org
5454

5555
#### Additional
5656

57+
- `--verbosity [LEVEL]`: Set the verbosity: `json` (print JSON response), `progress` (show progress bar) or `standard` (default).
5758
- `--retry [RETRY_FAILED]`: Retry failed downloads. You can specify the number of retry attempts as an integer.
5859
- `--worker [AMOUNT]`: The number of workers to use for downloading (simultaneous downloads). Default is 1. Beware: Using too many workers will lead to refused connections from the Wayback Machine. Duration about 1.5 minutes.
5960

pywaybackup/SnapshotCollection.py

Lines changed: 45 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -3,67 +3,65 @@
33

44
class SnapshotCollection:
55

6-
CDX_RESULT_JSON = []
7-
CDX_RESULT_LIST = []
8-
CDX_RESULT_COLLECTION = []
6+
CDX_JSON = []
7+
CDX_LIST = []
8+
9+
SNAPSHOT_COLLECTION = []
910

1011
MODE_CURRENT = 0
1112

12-
def __init__(self, cdxResult=None, cdxCollection=None):
13-
if cdxResult:
14-
self.CDX_RESULT_JSON = cdxResult.json()[1:]
15-
self.CDX_RESULT_LIST = [{"timestamp": snapshot[0], "url": snapshot[1]} for snapshot in self.CDX_RESULT_JSON]
16-
self.CDX_RESULT_LIST = sorted(self.CDX_RESULT_LIST, key=lambda k: k['timestamp'], reverse=True)
17-
if cdxCollection:
18-
self.CDX_RESULT_COLLECTION = cdxCollection
13+
def __init__(self):
14+
pass
15+
16+
def create_full(self, cdxResult):
    """Populate CDX_JSON and CDX_LIST from a CDX query response.

    Args:
        cdxResult: Object exposing ``json()``; the first row of the decoded
            payload is dropped (presumably the CDX header row - confirm
            against the caller).

    CDX_LIST ends up with one ``{"id", "timestamp", "url"}`` dict per
    remaining row, ordered by timestamp, newest first. ``id`` is the row's
    position before sorting.
    """
    rows = cdxResult.json()[1:]
    self.CDX_JSON = rows
    entries = []
    for position, row in enumerate(rows):
        entries.append({"id": position, "timestamp": row[0], "url": row[1]})
    entries.sort(key=lambda entry: entry["timestamp"], reverse=True)
    self.CDX_LIST = entries
1920

2021
def create_current(self):
    """Reduce CDX_LIST to the most recent snapshot of every URL.

    Sets MODE_CURRENT to 1, orders the list by timestamp (newest first) and
    keeps only the first entry seen for each distinct "url".
    """
    self.MODE_CURRENT = 1
    newest_first = sorted(self.CDX_LIST, key=lambda k: k['timestamp'], reverse=True)
    # dicts keep insertion order, so the values come back in the order in
    # which each URL was first encountered (i.e. its newest snapshot).
    latest_by_url = {}
    for entry in newest_first:
        latest_by_url.setdefault(entry["url"], entry)
    self.CDX_LIST = list(latest_by_url.values())
2831

29-
def create_collection(self, output):
30-
for snapshot in self.CDX_RESULT_LIST:
31-
timestamp, url = snapshot["timestamp"], snapshot["url"]
32-
url_type = self.__get_url_filetype(url)
33-
download_url = f"http://web.archive.org/web/{timestamp}{url_type}/{url}"
34-
domain, subdir, filename = self.__split_url(url)
35-
if self.MODE_CURRENT: download_dir = os.path.join(output, domain, subdir)
36-
else: download_dir = os.path.join(output, domain, timestamp, subdir)
37-
download_file = os.path.join(download_dir, filename)
38-
self.CDX_RESULT_COLLECTION.append(
39-
{
40-
"index": self.CDX_RESULT_LIST.index(snapshot),
41-
"url": download_url,
42-
"file": str(download_file),
43-
"success": False,
44-
"retry": 0
45-
}
46-
)
32+
def create_entry(self, cdx_entry: dict, output: str) -> dict:
    """Build the download record for one CDX entry.

    Args:
        cdx_entry: Dict providing "id", "timestamp" and "url".
        output: Root directory below which the file path is built.

    Returns:
        A new dict with the keys "id", "url" (web.archive.org download
        address), "file" (target path), "timestamp", "origin_url",
        "success" (False) and "retry" (0).
    """
    timestamp = cdx_entry["timestamp"]
    origin_url = cdx_entry["url"]
    url_type = self.__get_url_filetype(origin_url)
    download_url = f"http://web.archive.org/web/{timestamp}{url_type}/{origin_url}"
    domain, subdir, filename = self.__split_url(origin_url)
    # In "current" mode the timestamp level is left out of the directory tree.
    if self.MODE_CURRENT:
        target_dir = os.path.join(output, domain, subdir)
    else:
        target_dir = os.path.join(output, domain, timestamp, subdir)
    target_file = os.path.join(target_dir, filename)
    return {
        "id": cdx_entry["id"],
        "url": download_url,
        "file": target_file,
        "timestamp": timestamp,
        "origin_url": origin_url,
        "success": False,
        "retry": 0
    }
4750

4851
def count_list(self):
    """Return the number of entries currently held in CDX_LIST."""
    entries = self.CDX_LIST
    return len(entries)
5053

51-
def count_collection(self):
52-
return len(self.CDX_RESULT_COLLECTION)
54+
def snapshot_collection_write(self, query_entry: dict):
    """Append *query_entry* to SNAPSHOT_COLLECTION unless its id is present.

    Args:
        query_entry: Dict carrying at least an "id" key.

    The id is compared against the "id" of every stored entry. Testing the
    id for membership in the list itself would compare it with whole dicts
    and therefore never detect a duplicate.
    """
    entry_id = query_entry["id"]
    if not any(entry["id"] == entry_id for entry in self.SNAPSHOT_COLLECTION):
        self.SNAPSHOT_COLLECTION.append(query_entry)
5357

54-
def set_value(self, index: int, key: str, value: str):
55-
"""
56-
Set a value in the collection
58+
def snapshot_collection_update(self, id: int, key: str, value: str):
    """Set ``key`` to ``value`` on the first stored entry whose "id" matches.

    Args:
        id: Identifier to look for in SNAPSHOT_COLLECTION.
        key: Field of the matching entry to overwrite or add.
        value: New value for that field.

    Nothing happens when no entry carries the given id.
    """
    for entry in self.SNAPSHOT_COLLECTION:
        if entry["id"] == id:
            entry[key] = value
            break
5762

58-
Args:
59-
index (int): Index of the snapshot
60-
key (str): Key of the value
61-
value (str): Value to set
62-
"""
63-
self.CDX_RESULT_COLLECTION[index][key] = value
64-
6563
def __get_url_filetype(self, url):
66-
file_extension = url.split(".")[-1]
64+
file_extension = os.path.splitext(url)[1][1:]
6765
urltype_mapping = {
6866
"jpg": "im_",
6967
"jpeg": "im_",
@@ -80,6 +78,6 @@ def __get_url_filetype(self, url):
8078
def __split_url(self, url):
8179
parsed_url = urlparse(url)
8280
domain = parsed_url.netloc
83-
subdir = parsed_url.path.strip("/")
81+
subdir = parsed_url.path.strip("/").rsplit("/", 1)[0]
8482
filename = parsed_url.path.split("/")[-1] or "index.html"
8583
return domain, subdir, filename

pywaybackup/Verbosity.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import tqdm
2+
import json
3+
import time
4+
import pywaybackup.SnapshotCollection as sc
5+
6+
class Verbosity:
    """Console output switch shared by the whole process.

    All state lives on the class and every method is a classmethod, so the
    class is used directly and never instantiated.
    """

    # Collection whose SNAPSHOT_COLLECTION is dumped in "json" mode and whose
    # count_list() sizes the progress bar.
    snapshots = None
    # Active mode: "progress", "json" or "standard" (None until open() ran).
    mode = None
    # Raw verbosity value that was handed to open().
    args = None
    # tqdm bar; only set in "progress" mode after write(progress=0).
    pbar = None

    @classmethod
    def open(cls, args: str, snapshots: "sc.SnapshotCollection"):
        """Select the output mode and remember the snapshot collection.

        Args:
            args: Requested verbosity; "progress" and "json" select those
                modes, any other value selects "standard".
            snapshots: Collection consulted by close() and write().
        """
        cls.snapshots = snapshots
        cls.args = args
        if cls.args == "progress":
            cls.mode = "progress"
        elif cls.args == "json":
            cls.mode = "json"
        else:
            cls.mode = "standard"

    @classmethod
    def close(cls):
        """Emit the final output of the active mode.

        "json" prints SNAPSHOT_COLLECTION as indented JSON, "progress" closes
        the progress bar if one was created, "standard" prints an empty line.
        """
        if cls.mode == "json":
            print(json.dumps(cls.snapshots.SNAPSHOT_COLLECTION, indent=4, sort_keys=True))
        elif cls.mode == "progress":
            # Release the bar so it does not linger after the run.
            if cls.pbar is not None:
                cls.pbar.close()
                cls.pbar = None
        elif cls.mode == "standard":
            print("")

    @classmethod
    def write(cls, message: str = None, progress: int = None):
        """Report a message or a progress step according to the active mode.

        Args:
            message: Text printed in "standard" mode; ignored otherwise.
            progress: Only used in "progress" mode. 0 creates the bar sized
                by ``snapshots.count_list()``; 1 advances it by one step.
        """
        if cls.mode == "progress":
            if progress == 0:
                maxval = cls.snapshots.count_list()
                cls.pbar = tqdm.tqdm(total=maxval, desc="Downloading", unit=" snapshot", ascii="░▒█")
            elif progress == 1 and cls.pbar is not None:
                # A step reported before the bar exists is ignored instead of
                # failing on the missing bar.
                cls.pbar.update(1)
                cls.pbar.refresh()
        elif cls.mode == "json":
            # Nothing is printed until close() dumps the collection.
            pass
        else:
            if message:
                print(message)

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.2"
1+
__version__ = "0.6.0"

0 commit comments

Comments
 (0)