Merge branch 'f/only-explicit-url'

bitdruid · bitdruid · commit ecab7ecd0d90 · 2024-03-10T22:02:09.000+01:00
diff --git a/README.md b/README.md
@@ -49,6 +49,7 @@ This script allows you to download content from the Wayback Machine (archive.org
 #### Optional Arguments
 
 - `-l`, `--list`: Only print the snapshots available within the specified range. Does not download the snapshots.
+- `-e`, `--explicit`: Only download the explicitly given URL. No wildcard subdomains or paths.
 - `-r RANGE`, `--range RANGE`: Specify the range in years for which to search and download snapshots.
 - `-o OUTPUT`, `--output OUTPUT`: The folder where downloaded files will be saved.
 
diff --git a/pywaybackup/__version__.py b/pywaybackup/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.1"
+__version__ = "0.6.2"
diff --git a/pywaybackup/archive.py b/pywaybackup/archive.py
@@ -83,17 +83,17 @@ def print_list(snapshots):
 
 
 # create filelist
-def query_list(snapshots: sc.SnapshotCollection, url: str, range: int, mode: str):
+def query_list(snapshots: sc.SnapshotCollection, url: str, range: int, explicit: bool, mode: str):
     try:
         v.write("\nQuerying snapshots...")
         if range:
             range = datetime.now().year - range
             range = "&from=" + str(range)
         else:
             range = ""
-        cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url=*.{url}/*{range}&fl=timestamp,original&filter=!statuscode:200"
+        cdx_url = f"*.{url}/*" if not explicit else f"{url}"
+        cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{range}&fl=timestamp,original&filter=!statuscode:200"
         cdxResult = requests.get(cdxQuery)
-        if cdxResult.status_code != 200: v.write(f"\n-----> ERROR: could not query snapshots, status code: {cdxResult.status_code}"); exit()
         snapshots.create_full(cdxResult)
         if mode == "current": snapshots.create_current()
         v.write(f"\n-----> {snapshots.count_list()} snapshots found")
@@ -142,6 +142,9 @@ def download_list(snapshots, output, retry, worker):
     """
     Download a list of urls in format: [{"timestamp": "20190815104545", "url": "https://www.google.com/"}]
     """
+    if snapshots.count_list() == 0: 
+        v.write("\nNo snapshots found to download")
+        return
     v.write("\nDownloading latest snapshots of each file...", progress=0)
     download_list = snapshots.CDX_LIST
     if worker > 1:
diff --git a/pywaybackup/main.py b/pywaybackup/main.py
@@ -18,6 +18,7 @@ def main():
     exclusive_required.add_argument('-s', '--save', action='store_true', help='Save a page to the wayback machine')
     optional = parser.add_argument_group('optional')
     optional.add_argument('-l', '--list', action='store_true', help='Only print snapshots (opt range in y)')
+    optional.add_argument('-e', '--explicit', action='store_true', help='Search only for the explicit given url')
     optional.add_argument('-r', '--range', type=int, help='Range in years to search')
     optional.add_argument('-o', '--output', type=str, help='Output folder')
     optional.add_argument('-v', '--verbosity', type=str, default="standard", choices=["standard", "progress", "json"], help='Verbosity level')
@@ -39,7 +40,7 @@ def main():
     else:
         if args.output is None:
             args.output = os.path.join(os.getcwd(), "waybackup_snapshots")
-        archive.query_list(snapshots, args.url, args.range, mode)
+        archive.query_list(snapshots, args.url, args.range, args.explicit, mode)
         if args.list:
             archive.print_list(snapshots)
         else:

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.6.1"`
	`1`	`+__version__ = "0.6.2"`