Merge branch 'f/csv-output'

bitdruid · bitdruid · commit cff8c6d953c2 · 2024-04-08T16:34:18.000+02:00
diff --git a/README.md b/README.md
@@ -63,10 +63,11 @@ Specify the range in years or a specific timestamp either start, end or both. If
 
 #### Additional
 
+- `--csv`: Save a csv file with the list of snapshots inside the output folder.
 - `--no-redirect`: Do not follow redirects of snapshots. Archive.org sometimes redirects to a different snapshot for several reasons. Downloading redirects may lead to timestamp-folders which contain some files with a different timestamp. This does not matter if you only want to download the latest version (`-c`).
 - `--verbosity [LEVEL]`: Set the verbosity: json (print json response), progress (show progress bar) or standard (default).
 - `--retry [RETRY_FAILED]`: Retry failed downloads. You can specify the number of retry attempts as an integer.
-- `--worker [AMOUNT]`: The number of worker to use for downloading (simultaneous downloads). Default is 1. Beware: Using too many worker will lead into refused connections from the Wayback Machine. Duration about 1.5 minutes.
+- `--worker [AMOUNT]`: The number of workers to use for downloading (simultaneous downloads). Default is 1. A safe spot is about 10 workers. Beware: Using too many workers will lead to refused connections from the Wayback Machine. Duration about 1.5 minutes.
 
 ### Examples
 
diff --git a/pywaybackup/SnapshotCollection.py b/pywaybackup/SnapshotCollection.py
@@ -3,44 +3,54 @@
 
 class SnapshotCollection:
 
-    CDX_LIST = []
     SNAPSHOT_COLLECTION = []
     MODE_CURRENT = 0
 
     @classmethod
-    def create_list_full(cls, cdxResult):
-        cls.CDX_LIST = sorted([{"timestamp": snapshot[0], "url": snapshot[1], "status": snapshot[2], "mimetype": snapshot[3], "digest": snapshot[4]} for i, snapshot in enumerate(cdxResult.json()[1:])], key=lambda k: k['timestamp'], reverse=True)
-
-    @classmethod
-    def create_list_current(cls):
-        cls.MODE_CURRENT = 1
-        cdxResult_list_filtered = []
-        url_set = set()
-        for snapshot in cls.CDX_LIST:
-            if snapshot["url"] not in url_set:
-                cdxResult_list_filtered.append(snapshot)
-                url_set.add(snapshot["url"])
-        cls.CDX_LIST = cdxResult_list_filtered
+    def create_list(cls, cdxResult, mode):
+        """
+        Create the snapshot collection list from a cdx result.
+
+        The first row of `cdxResult.json()` is skipped (presumably the field-name
+        header of the cdx response); every remaining row is read positionally as
+        timestamp, digest, mimetype, status, url. The entries are sorted by
+        timestamp, newest first, and each one gets an `id` equal to its final
+        position in the list.
+
+        - mode `full`: All snapshots are included.
+        - mode `current`: Only the latest snapshot of each file is included.
+        """
+        # creates a list of dictionaries for each snapshot entry, newest first
+        snapshots = sorted(
+            [{"timestamp": snapshot[0], "digest": snapshot[1], "mimetype": snapshot[2], "status": snapshot[3], "url": snapshot[4]} for snapshot in cdxResult.json()[1:]],
+            key=lambda k: k['timestamp'],
+            reverse=True,
+        )
+        # set the flag on every call so an earlier `current` call cannot leak into a later `full` call
+        cls.MODE_CURRENT = 1 if mode == "current" else 0
+        if mode == "current":
+            latest_per_url = []
+            seen_urls = set()
+            # the list is sorted newest first, so the first hit per url is its latest snapshot
+            for snapshot in snapshots:
+                if snapshot["url"] not in seen_urls:
+                    latest_per_url.append(snapshot)
+                    seen_urls.add(snapshot["url"])
+            snapshots = latest_per_url
+        # writes the index for each snapshot entry
+        cls.SNAPSHOT_COLLECTION = [{"id": idx, **entry} for idx, entry in enumerate(snapshots)]
+    
     @classmethod
     def count_list(cls):
+        """Return the number of entries currently held in the snapshot collection."""
-        return len(cls.CDX_LIST)
+        return len(cls.SNAPSHOT_COLLECTION)
 
     @classmethod
     def create_collection(cls):
+        """
+        Replace the snapshot list with download entries.
+
+        Each entry keeps its position as `id`, its timestamp and origin url, gets
+        the archive url built from them, and starts with `redirect_url`,
+        `redirect_timestamp`, `response` and `file` set to False.
+        """
-        for cdx_entry in cls.CDX_LIST:
+        new_collection = []
+        # enumerate gives the position directly; list.index() would search the list again for every entry
+        for idx, cdx_entry in enumerate(cls.SNAPSHOT_COLLECTION):
             timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
             url_archive = f"http://web.archive.org/web/{timestamp}{cls._url_get_filetype(url)}/{url}"
             collection_entry = {
-                "id": len(cls.SNAPSHOT_COLLECTION),
+                "id": idx,
                 "timestamp": timestamp,
                 "url_archive": url_archive,
                 "url_origin": url,
-                "file": False,
-                "redirect": False,
-                "response": False
+                "redirect_url": False,
+                "redirect_timestamp": False,
+                "response": False,
+                "file": False
             }
-            cls.SNAPSHOT_COLLECTION.append(collection_entry)
+            new_collection.append(collection_entry)
+        cls.SNAPSHOT_COLLECTION = new_collection
     
     @classmethod
     def snapshot_entry_create_output(cls, collection_entry: dict, output: str) -> str:
@@ -60,7 +70,7 @@ def snapshot_entry_create_output(cls, collection_entry: dict, output: str) -> st
             download_dir = os.path.join(output, domain, subdir)
         else:
             download_dir = os.path.join(output, domain, timestamp, subdir)
-        download_file = os.path.join(download_dir, filename)
+        download_file = os.path.abspath(os.path.join(download_dir, filename))
         return download_file
 
     @classmethod
diff --git a/pywaybackup/Verbosity.py b/pywaybackup/Verbosity.py
@@ -4,7 +4,6 @@
 
 class Verbosity:
 
-    snapshots = None
     mode = None
     args = None
     pbar = None
@@ -22,10 +21,10 @@ def open(cls, args: list):
     @classmethod
     def close(cls):
         if cls.mode == "progress":
-            cls.pbar.close()
+            if cls.pbar is not None: cls.pbar.close()
         if cls.mode == "progress" or cls.mode == "standard":
-            successed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if snapshot["file"]])
-            failed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if not snapshot["file"]])
+            successed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if "file" in snapshot and snapshot["file"]])
+            failed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if "file" in snapshot and not snapshot["file"]])
             print(f"\nFiles downloaded: {successed}")
             print(f"Files missing: {failed}")
             print("")
@@ -39,7 +38,7 @@ def write(cls, message: str = None, progress: int = None):
                 print("")
                 maxval = sc.count_list()
                 cls.pbar = tqdm.tqdm(total=maxval, desc="Downloading", unit=" snapshot", ascii="░▒█")
-            elif progress == 1:
+            elif cls.pbar is not None and progress == 1:
                 cls.pbar.update(1)
                 cls.pbar.refresh()
         elif cls.mode == "json":
diff --git a/pywaybackup/__version__.py b/pywaybackup/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.1"
+__version__ = "0.8.0"
diff --git a/pywaybackup/archive.py b/pywaybackup/archive.py
@@ -76,7 +76,7 @@ def print_list():
     if count == 0:
         v.write("\nNo snapshots found")
     else:
-        __import__('pprint').pprint(sc.CDX_LIST)
+        __import__('pprint').pprint(sc.SNAPSHOT_COLLECTION)
         v.write(f"\n-----> {count} snapshots listed")
 
 
@@ -95,10 +95,9 @@ def query_list(url: str, range: int, start: int, end: int, explicit: bool, mode:
         else: 
             query_range = "&from=" + str(datetime.now().year - range)
         cdx_url = f"*.{url}/*" if not explicit else f"{url}"
-        cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{query_range}&fl=timestamp,original,statuscode,mimetype,digest&filter!=statuscode:200"
+        cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{query_range}&fl=timestamp,digest,mimetype,statuscode,original&filter!=statuscode:200"
         cdxResult = requests.get(cdxQuery)
-        sc.create_list_full(cdxResult)
-        sc.create_list_current() if mode == "current" else None
+        sc.create_list(cdxResult, mode)
         v.write(f"\n-----> {sc.count_list()} snapshots found")
     except requests.exceptions.ConnectionError as e:
         v.write(f"\n-----> ERROR: could not query snapshots:\n{e}"); exit()
@@ -195,7 +194,6 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
                                 f"           -> URL: {location}"
                             location = urljoin(download_url, location)
                             download_url = location
-                            sc.snapshot_entry_modify(snapshot_entry, "redirect", True)
                             sc.snapshot_entry_modify(snapshot_entry, "redirect_timestamp", sc.url_get_timestamp(location))
                             sc.snapshot_entry_modify(snapshot_entry, "redirect_url", location)
                         else:
@@ -256,4 +254,18 @@ def parse_response_code(response_code: int):
     """
     if response_code in RESPONSE_CODE_DICT:
         return RESPONSE_CODE_DICT[response_code]
-    return "Unknown response code"
+    return "Unknown response code"
+
+def save_csv(output: str):
+    """
+    Write a CSV file with the list of snapshots.
+
+    The file is written to `waybackup.csv` inside `output`; the folder is created
+    if it does not exist. The column names are taken from the keys of the first
+    snapshot entry. Nothing is written when the snapshot collection is empty.
+    """
+    import csv
+    if sc.count_list() > 0:
+        v.write("\nSaving CSV file...")
+        os.makedirs(os.path.abspath(output), exist_ok=True)
+        # newline='' is required by the csv module; without it rows are followed
+        # by an empty line on platforms that translate "\n" to "\r\n"
+        with open(os.path.join(output, "waybackup.csv"), mode='w', newline='') as file:
+            writer = csv.DictWriter(file, sc.SNAPSHOT_COLLECTION[0].keys())
+            writer.writeheader()
+            writer.writerows(sc.SNAPSHOT_COLLECTION)
diff --git a/pywaybackup/arguments.py b/pywaybackup/arguments.py
@@ -22,6 +22,7 @@ def parse():
     optional.add_argument('--end', type=int, help='End timestamp format: YYYYMMDDhhmmss')
 
     special = parser.add_argument_group('special')
+    special.add_argument('--csv', action='store_true', help='Save a csv file with the list of snapshots inside the output folder')
     special.add_argument('--no-redirect', action='store_true', help='Do not follow redirects by archive.org')
     special.add_argument('--verbosity', type=str, default="standard", choices=["standard", "progress", "json"], help='Verbosity level')
     special.add_argument('--retry', type=int, default=0, metavar="X-TIMES", help='Retry failed downloads (opt tries as int, else infinite)')
diff --git a/pywaybackup/main.py b/pywaybackup/main.py
@@ -1,5 +1,4 @@
 import pywaybackup.archive as archive
-import pywaybackup.SnapshotCollection as sc
 import os
 
 from pywaybackup.arguments import parse
@@ -9,22 +8,24 @@ def main():
     args = parse()
     v.open(args.verbosity)
 
+    if args.full:
+        mode = "full"
     if args.current:
         mode = "current"
-    elif args.full:
-        mode = "full"
+
+    if args.output is None:
+        args.output = os.path.join(os.getcwd(), "waybackup_snapshots")
 
     if args.save:
         archive.save_page(args.url)
     else:
-        if args.output is None:
-            args.output = os.path.join(os.getcwd(), "waybackup_snapshots")
         archive.query_list(args.url, args.range, args.start, args.end, args.explicit, mode)
         if args.list:
             archive.print_list()
         else:
-            archive.download_list(args.output, args.retry, args.no_redirect, args.worker)            
-            #archive.remove_empty_folders(args.output)
+            archive.download_list(args.output, args.retry, args.no_redirect, args.worker)
+        if args.csv:
+            archive.save_csv(args.output)
     v.close()
 
 if __name__ == "__main__":

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.7.1"`
	`1`	`+__version__ = "0.8.0"`