Skip to content

Commit ecab7ec

Browse files
committed
Merge branch 'f/only-explicit-url'
1 parent 6ed5198 commit ecab7ec

File tree

4 files changed

+10
-5
lines changed

4 files changed

+10
-5
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ This script allows you to download content from the Wayback Machine (archive.org
4949
#### Optional Arguments
5050

5151
- `-l`, `--list`: Only print the snapshots available within the specified range. Does not download the snapshots.
52+
- `-e`, `--explicit`: Only download the explicitly given URL, without wildcard subdomains or paths.
5253
- `-r RANGE`, `--range RANGE`: Specify the range in years for which to search and download snapshots.
5354
- `-o OUTPUT`, `--output OUTPUT`: The folder where downloaded files will be saved.
5455

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.1"
1+
__version__ = "0.6.2"

pywaybackup/archive.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,17 @@ def print_list(snapshots):
8383

8484

8585
# create filelist
86-
def query_list(snapshots: sc.SnapshotCollection, url: str, range: int, mode: str):
86+
def query_list(snapshots: sc.SnapshotCollection, url: str, range: int, explicit: bool, mode: str):
8787
try:
8888
v.write("\nQuerying snapshots...")
8989
if range:
9090
range = datetime.now().year - range
9191
range = "&from=" + str(range)
9292
else:
9393
range = ""
94-
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url=*.{url}/*{range}&fl=timestamp,original&filter=!statuscode:200"
94+
cdx_url = f"*.{url}/*" if not explicit else f"{url}"
95+
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{range}&fl=timestamp,original&filter=!statuscode:200"
9596
cdxResult = requests.get(cdxQuery)
96-
if cdxResult.status_code != 200: v.write(f"\n-----> ERROR: could not query snapshots, status code: {cdxResult.status_code}"); exit()
9797
snapshots.create_full(cdxResult)
9898
if mode == "current": snapshots.create_current()
9999
v.write(f"\n-----> {snapshots.count_list()} snapshots found")
@@ -142,6 +142,9 @@ def download_list(snapshots, output, retry, worker):
142142
"""
143143
Download a list of urls in format: [{"timestamp": "20190815104545", "url": "https://www.google.com/"}]
144144
"""
145+
if snapshots.count_list() == 0:
146+
v.write("\nNo snapshots found to download")
147+
return
145148
v.write("\nDownloading latest snapshots of each file...", progress=0)
146149
download_list = snapshots.CDX_LIST
147150
if worker > 1:

pywaybackup/main.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def main():
1818
exclusive_required.add_argument('-s', '--save', action='store_true', help='Save a page to the wayback machine')
1919
optional = parser.add_argument_group('optional')
2020
optional.add_argument('-l', '--list', action='store_true', help='Only print snapshots (opt range in y)')
21+
optional.add_argument('-e', '--explicit', action='store_true', help='Search only for the explicit given url')
2122
optional.add_argument('-r', '--range', type=int, help='Range in years to search')
2223
optional.add_argument('-o', '--output', type=str, help='Output folder')
2324
optional.add_argument('-v', '--verbosity', type=str, default="standard", choices=["standard", "progress", "json"], help='Verbosity level')
@@ -39,7 +40,7 @@ def main():
3940
else:
4041
if args.output is None:
4142
args.output = os.path.join(os.getcwd(), "waybackup_snapshots")
42-
archive.query_list(snapshots, args.url, args.range, mode)
43+
archive.query_list(snapshots, args.url, args.range, args.explicit, mode)
4344
if args.list:
4445
archive.print_list(snapshots)
4546
else:

0 commit comments

Comments
 (0)