Skip to content

Commit 4f12a31

Browse files
committed
Merge branch 'f/code-enhancement'
1 parent 7a17533 commit 4f12a31

File tree

7 files changed

+205
-211
lines changed

7 files changed

+205
-211
lines changed

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ Internet-archive is a nice source for several OSINT-information. This script is
3131

3232
This script allows you to download content from the Wayback Machine (archive.org). You can use it to download either the latest version or all versions of web page snapshots within a specified range.
3333

34+
## Info
35+
36+
- For now, the script only requests snapshots that were recorded with status code 200. The status code returned when the file is actually downloaded may still differ.
37+
3438
### Arguments
3539

3640
- `-h`, `--help`: Show the help message and exit.
@@ -42,8 +46,8 @@ This script allows you to download content from the Wayback Machine (archive.org
4246

4347
#### Mode Selection (Choose One)
4448

45-
- `-c`, `--current`: Download the latest version of each file snapshot.
46-
- `-f`, `--full`: Download snapshots of all timestamps.
49+
- `-c`, `--current`: Download the latest snapshot of each file. You will get a rebuild of the current website from all available files.
50+
- `-f`, `--full`: Download snapshots of all timestamps. You will get a folder per timestamp with the files available at that time.
4751
- `-s`, `--save`: Save a page to the Wayback Machine. (beta)
4852

4953
#### Optional Arguments

pywaybackup/SnapshotCollection.py

Lines changed: 71 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -3,68 +3,89 @@
33

44
class SnapshotCollection:
55

6-
CDX_JSON = []
76
CDX_LIST = []
8-
97
SNAPSHOT_COLLECTION = []
10-
118
MODE_CURRENT = 0
129

13-
def __init__(self):
14-
pass
15-
16-
def create_full(self, cdxResult):
17-
self.CDX_JSON = cdxResult.json()[1:]
18-
self.CDX_LIST = [{"timestamp": snapshot[0], "url": snapshot[1]} for i, snapshot in enumerate(self.CDX_JSON)]
19-
self.CDX_LIST = sorted(self.CDX_LIST, key=lambda k: k['timestamp'], reverse=True)
10+
@classmethod
11+
def create_list_full(cls, cdxResult):
12+
cls.CDX_LIST = sorted([{"timestamp": snapshot[0], "url": snapshot[1]} for i, snapshot in enumerate(cdxResult.json()[1:])], key=lambda k: k['timestamp'], reverse=True)
2013

21-
def create_current(self):
22-
self.MODE_CURRENT = 1
23-
self.CDX_LIST = sorted(self.CDX_LIST, key=lambda k: k['timestamp'], reverse=True)
14+
@classmethod
15+
def create_list_current(cls):
16+
cls.MODE_CURRENT = 1
2417
cdxResult_list_filtered = []
2518
url_set = set()
26-
for snapshot in self.CDX_LIST:
19+
for snapshot in cls.CDX_LIST:
2720
if snapshot["url"] not in url_set:
2821
cdxResult_list_filtered.append(snapshot)
2922
url_set.add(snapshot["url"])
30-
self.CDX_LIST = cdxResult_list_filtered
23+
cls.CDX_LIST = cdxResult_list_filtered
3124

32-
def create_entry(self, cdx_entry: dict, output: str) -> dict:
33-
timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
34-
domain, subdir, filename = self.split_url(url)
35-
if self.MODE_CURRENT: download_dir = os.path.join(output, domain, subdir)
36-
else: download_dir = os.path.join(output, domain, timestamp, subdir)
37-
download_file = os.path.join(download_dir, filename)
38-
cdx_entry = {
39-
"id": len(self.SNAPSHOT_COLLECTION),
40-
"url": self.create_archive_url(timestamp, url),
41-
"file": download_file,
25+
@classmethod
26+
def count_list(cls):
27+
return len(cls.CDX_LIST)
28+
29+
@classmethod
30+
def create_collection(cls):
31+
for cdx_entry in cls.CDX_LIST:
32+
timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
33+
url_archive = f"http://web.archive.org/web/{timestamp}{cls._url_get_filetype(url)}/{url}"
34+
collection_entry = {
35+
"id": len(cls.SNAPSHOT_COLLECTION),
4236
"timestamp": timestamp,
43-
"origin_url": url,
44-
"success": False,
45-
"retry": 0
37+
"url_archive": url_archive,
38+
"url_origin": url,
39+
"file": False,
40+
"redirect": False,
41+
"http_code": False,
42+
"http_message": False,
43+
"retry": False
4644
}
47-
return cdx_entry
45+
cls.SNAPSHOT_COLLECTION.append(collection_entry)
46+
47+
@classmethod
48+
def snapshot_entry_create_output(cls, collection_entry: dict, output: str) -> str:
49+
"""
50+
Create the output path for a snapshot entry of the collection according to the mode.
4851
49-
@classmethod
50-
def create_archive_url(cls, timestamp: str, url: str) -> str:
51-
url_type = cls.__get_url_filetype(url)
52-
return f"http://web.archive.org/web/{timestamp}{url_type}/{url}"
52+
Input:
53+
- collection_entry: A single snapshot entry of the collection (dict).
54+
- output: The output directory (str).
5355
54-
def count_list(self):
55-
return len(self.CDX_LIST)
56-
57-
def snapshot_collection_write(self, query_entry: dict):
58-
if query_entry["id"] not in self.SNAPSHOT_COLLECTION:
59-
self.SNAPSHOT_COLLECTION.append(query_entry)
60-
61-
def snapshot_collection_update(self, id: int, key: str, value: str):
62-
index = next((index for (index, d) in enumerate(self.SNAPSHOT_COLLECTION) if d["id"] == id), None)
63-
if index is not None:
64-
self.SNAPSHOT_COLLECTION[index][key] = value
56+
Output:
57+
- download_file: The output path for the snapshot entry (str) with filename.
58+
"""
59+
timestamp, url = collection_entry["timestamp"], collection_entry["url_origin"]
60+
domain, subdir, filename = cls._url_split(url)
61+
if cls.MODE_CURRENT:
62+
download_dir = os.path.join(output, domain, subdir)
63+
else:
64+
download_dir = os.path.join(output, domain, timestamp, subdir)
65+
download_file = os.path.join(download_dir, filename)
66+
return download_file
67+
68+
@classmethod
69+
def snapshot_entry_modify(cls, collection_entry: dict, key: str, value: str):
70+
"""
71+
Modify a key-value pair in a snapshot entry of the collection (dict).
72+
73+
- Append a new key-value pair if the key does not exist.
74+
- Modify an existing key-value pair if the key exists.
75+
"""
76+
collection_entry[key] = value
6577

6678
@classmethod
67-
def get_url_filetype(cls, url):
79+
def url_get_timestamp(cls, url):
80+
"""
81+
Extract the timestamp from a wayback machine URL.
82+
"""
83+
import re
84+
timestamp = re.search(r'web.archive.org/web/(\d+)/', url).group(1)
85+
return timestamp
86+
87+
@classmethod
88+
def _url_get_filetype(cls, url):
6889
file_extension = os.path.splitext(url)[1][1:]
6990
urltype_mapping = {
7091
"jpg": "im_",
@@ -79,8 +100,11 @@ def get_url_filetype(cls, url):
79100
urltype = urltype_mapping.get(file_extension, "id_")
80101
return urltype
81102

82-
@staticmethod
83-
def split_url(url):
103+
@classmethod
104+
def _url_split(cls, url):
105+
"""
106+
Split a URL into domain, subdir and filename.
107+
"""
84108
parsed_url = urlparse(url)
85109
domain = parsed_url.netloc
86110
subdir = parsed_url.path.strip("/").rsplit("/", 1)[0]

pywaybackup/Verbosity.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import tqdm
22
import json
3-
import pywaybackup.SnapshotCollection as sc
3+
from pywaybackup.SnapshotCollection import SnapshotCollection as sc
44

55
class Verbosity:
66

@@ -10,8 +10,7 @@ class Verbosity:
1010
pbar = None
1111

1212
@classmethod
13-
def open(cls, args: list, snapshots: sc.SnapshotCollection):
14-
cls.snapshots = snapshots
13+
def open(cls, args: list):
1514
cls.args = args
1615
if cls.args == "progress":
1716
cls.mode = "progress"
@@ -25,20 +24,20 @@ def close(cls):
2524
if cls.mode == "progress":
2625
cls.pbar.close()
2726
if cls.mode == "progress" or cls.mode == "standard":
28-
successed = len([snapshot for snapshot in cls.snapshots.SNAPSHOT_COLLECTION if snapshot["success"]])
29-
failed = len([snapshot for snapshot in cls.snapshots.SNAPSHOT_COLLECTION if not snapshot["success"]])
30-
print(f"\nSuccessed downloads: {successed}")
31-
print(f"Failed downloads: {failed}")
27+
successed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if snapshot["file"]])
28+
failed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if not snapshot["file"]])
29+
print(f"\nFiles downloaded: {successed}")
30+
print(f"Files missing: {failed}")
3231
print("")
3332
if cls.mode == "json":
34-
print(json.dumps(cls.snapshots.SNAPSHOT_COLLECTION, indent=4, sort_keys=True))
33+
print(json.dumps(sc.SNAPSHOT_COLLECTION, indent=4, sort_keys=True))
3534

3635
@classmethod
3736
def write(cls, message: str = None, progress: int = None):
3837
if cls.mode == "progress":
3938
if progress == 0:
4039
print("")
41-
maxval = cls.snapshots.count_list()
40+
maxval = sc.count_list()
4241
cls.pbar = tqdm.tqdm(total=maxval, desc="Downloading", unit=" snapshot", ascii="░▒█")
4342
elif progress == 1:
4443
cls.pbar.update(1)

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.4"
1+
__version__ = "0.7.0"

0 commit comments

Comments
 (0)