Skip to content

Commit 7a17533

Browse files
committed
Merge branch 'r/beta'
1 parent 6bb6076 commit 7a17533

File tree

7 files changed

+138
-96
lines changed

7 files changed

+138
-96
lines changed

README.md

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
[![PyPI](https://img.shields.io/pypi/v/pywaybackup)](https://pypi.org/project/pywaybackup/)
44
[![PyPI - Downloads](https://img.shields.io/pypi/dm/pywaybackup)](https://pypi.org/project/pywaybackup/)
5-
![Release](https://img.shields.io/badge/Release-alpha-red)
5+
![Release](https://img.shields.io/badge/Release-beta-orange)
66
![Python Version](https://img.shields.io/badge/Python-3.6-blue)
77
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
88

@@ -61,6 +61,8 @@ Specify the range in years or a specific timestamp either start, end or both. If
6161

6262
#### Additional
6363

64+
- `--redirect`: Follow redirects of snapshots. Default is False. If a source does not have status code 200, archive.org will redirect to the closest snapshot. So when this is set to `true`, parts of a timestamp folder may not truly belong to the given timestamp.
65+
<!-- - `--harvest`: The downloaded files are scanned for locations on the same domain. These locations (mostly resources) are then tried to be accessed within the same timestamp. Setting this to `true` may result in identical files in different timestamps but you may get a more complete snapshot of the website. -->
6466
- `--verbosity [LEVEL]`: Set the verbosity: json (print json response), progress (show progress bar) or standard (default).
6567
- `--retry [RETRY_FAILED]`: Retry failed downloads. You can specify the number of retry attempts as an integer.
6668
- `--worker [AMOUNT]`: The number of workers to use for downloading (simultaneous downloads). Default is 1. Beware: Using too many workers will lead to refused connections from the Wayback Machine. The refusal lasts about 1.5 minutes.
@@ -73,19 +75,15 @@ Download latest snapshot of all files:<br>
7375
Download latest snapshot of all files with retries:<br>
7476
`waybackup -u http://example.com -c --retry 3`
7577

76-
Download all snapshots sorted per timestamp with a specified range:<br>
77-
`waybackup -u http://example.com -f -r 5`
78+
Download all snapshots sorted per timestamp with a specified range and follow redirects:<br>
79+
`waybackup -u http://example.com -f -r 5 --redirect`
7880

7981
Download all snapshots sorted per timestamp with a specified range and save to a specified folder with 3 workers:<br>
8082
`waybackup -u http://example.com -f -r 5 -o /home/user/Downloads/snapshots --worker 3`
8183

8284
List available snapshots per timestamp without downloading:<br>
8385
`waybackup -u http://example.com -f -l`
8486

85-
## Info
86-
87-
The script automatically follows redirects of snapshots.
88-
8987
## Contributing
9088

9189
I'm always happy to receive feature requests that improve the usability of this script.

pywaybackup/SnapshotCollection.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,13 @@ def create_current(self):
3131

3232
def create_entry(self, cdx_entry: dict, output: str) -> dict:
3333
timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
34-
url_type = self.__get_url_filetype(url)
35-
download_url = f"http://web.archive.org/web/{timestamp}{url_type}/{url}"
36-
domain, subdir, filename = self.__split_url(url)
34+
domain, subdir, filename = self.split_url(url)
3735
if self.MODE_CURRENT: download_dir = os.path.join(output, domain, subdir)
3836
else: download_dir = os.path.join(output, domain, timestamp, subdir)
3937
download_file = os.path.join(download_dir, filename)
4038
cdx_entry = {
4139
"id": len(self.SNAPSHOT_COLLECTION),
42-
"url": download_url,
40+
"url": self.create_archive_url(timestamp, url),
4341
"file": download_file,
4442
"timestamp": timestamp,
4543
"origin_url": url,
@@ -48,6 +46,11 @@ def create_entry(self, cdx_entry: dict, output: str) -> dict:
4846
}
4947
return cdx_entry
5048

49+
@classmethod
50+
def create_archive_url(cls, timestamp: str, url: str) -> str:
51+
url_type = cls.__get_url_filetype(url)
52+
return f"http://web.archive.org/web/{timestamp}{url_type}/{url}"
53+
5154
def count_list(self):
5255
return len(self.CDX_LIST)
5356

@@ -60,7 +63,8 @@ def snapshot_collection_update(self, id: int, key: str, value: str):
6063
if index is not None:
6164
self.SNAPSHOT_COLLECTION[index][key] = value
6265

63-
def __get_url_filetype(self, url):
66+
@classmethod
67+
def get_url_filetype(cls, url):
6468
file_extension = os.path.splitext(url)[1][1:]
6569
urltype_mapping = {
6670
"jpg": "im_",
@@ -69,13 +73,14 @@ def __get_url_filetype(self, url):
6973
"gif": "im_",
7074
"svg": "im_",
7175
"ico": "im_",
72-
"css": "cs_",
73-
"js": "js_"
76+
"css": "cs_"
77+
#"js": "js_"
7478
}
7579
urltype = urltype_mapping.get(file_extension, "id_")
7680
return urltype
77-
78-
def __split_url(self, url):
81+
82+
@staticmethod
83+
def split_url(url):
7984
parsed_url = urlparse(url)
8085
domain = parsed_url.netloc
8186
subdir = parsed_url.path.strip("/").rsplit("/", 1)[0]

pywaybackup/Verbosity.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import tqdm
22
import json
3-
import time
43
import pywaybackup.SnapshotCollection as sc
54

65
class Verbosity:
@@ -23,15 +22,22 @@ def open(cls, args: list, snapshots: sc.SnapshotCollection):
2322

2423
@classmethod
2524
def close(cls):
25+
if cls.mode == "progress":
26+
cls.pbar.close()
27+
if cls.mode == "progress" or cls.mode == "standard":
28+
successed = len([snapshot for snapshot in cls.snapshots.SNAPSHOT_COLLECTION if snapshot["success"]])
29+
failed = len([snapshot for snapshot in cls.snapshots.SNAPSHOT_COLLECTION if not snapshot["success"]])
30+
print(f"\nSuccessed downloads: {successed}")
31+
print(f"Failed downloads: {failed}")
32+
print("")
2633
if cls.mode == "json":
2734
print(json.dumps(cls.snapshots.SNAPSHOT_COLLECTION, indent=4, sort_keys=True))
28-
elif cls.mode == "standard":
29-
print("")
3035

3136
@classmethod
3237
def write(cls, message: str = None, progress: int = None):
3338
if cls.mode == "progress":
3439
if progress == 0:
40+
print("")
3541
maxval = cls.snapshots.count_list()
3642
cls.pbar = tqdm.tqdm(total=maxval, desc="Downloading", unit=" snapshot", ascii="░▒█")
3743
elif progress == 1:

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.3"
1+
__version__ = "0.6.4"

0 commit comments

Comments
 (0)