bitdruid
diff --git a/‎README.md
Lines changed: 6 additions & 2 deletions b/‎README.md
Lines changed: 6 additions & 2 deletions
diff --git a/‎pywaybackup/SnapshotCollection.py
Lines changed: 71 additions & 47 deletions b/‎pywaybackup/SnapshotCollection.py
Lines changed: 71 additions & 47 deletions
diff --git a/‎pywaybackup/Verbosity.py
Lines changed: 8 additions & 9 deletions b/‎pywaybackup/Verbosity.py
Lines changed: 8 additions & 9 deletions
diff --git a/‎pywaybackup/__version__.py
Lines changed: 1 addition & 1 deletion b/‎pywaybackup/__version__.py
Lines changed: 1 addition & 1 deletion
@@ -31,6 +31,10 @@ Internet-archive is a nice source for several OSINT-information. This script is
 
 This script allows you to download content from the Wayback Machine (archive.org). You can use it to download either the latest version or all versions of web page snapshots within a specified range.
 
+## Info
+
+- For now, the script only requests snapshots with status code 200. Note that the status code returned when the file is actually downloaded may differ.
+
 ### Arguments
 
 - `-h`, `--help`: Show the help message and exit.
@@ -42,8 +46,8 @@ This script allows you to download content from the Wayback Machine (archive.org
 
 #### Mode Selection (Choose One)
 
-- `-c`, `--current`: Download the latest version of each file snapshot.
-- `-f`, `--full`: Download snapshots of all timestamps.
+- `-c`, `--current`: Download the latest version of each file snapshot. You will get a reconstruction of the current website with all available files.
+- `-f`, `--full`: Download snapshots of all timestamps. You will get a folder per timestamp with the files available at that time.
 - `-s`, `--save`: Save a page to the Wayback Machine. (beta)
 
 #### Optional Arguments
 
@@ -3,68 +3,89 @@
 
 class SnapshotCollection:
 
-    CDX_JSON = []
     CDX_LIST = []
-
     SNAPSHOT_COLLECTION = []
-
     MODE_CURRENT = 0
 
-    def __init__(self):
-        pass
-
-    def create_full(self, cdxResult):
-        self.CDX_JSON = cdxResult.json()[1:]
-        self.CDX_LIST = [{"timestamp": snapshot[0], "url": snapshot[1]} for i, snapshot in enumerate(self.CDX_JSON)]
-        self.CDX_LIST = sorted(self.CDX_LIST, key=lambda k: k['timestamp'], reverse=True)
+    @classmethod
+    def create_list_full(cls, cdxResult):
+        cls.CDX_LIST = sorted([{"timestamp": snapshot[0], "url": snapshot[1]} for i, snapshot in enumerate(cdxResult.json()[1:])], key=lambda k: k['timestamp'], reverse=True)
 
-    def create_current(self):
-        self.MODE_CURRENT = 1
-        self.CDX_LIST = sorted(self.CDX_LIST, key=lambda k: k['timestamp'], reverse=True)
+    @classmethod
+    def create_list_current(cls):
+        cls.MODE_CURRENT = 1
         cdxResult_list_filtered = []
         url_set = set()
-        for snapshot in self.CDX_LIST:
+        for snapshot in cls.CDX_LIST:
             if snapshot["url"] not in url_set:
                 cdxResult_list_filtered.append(snapshot)
                 url_set.add(snapshot["url"])
-        self.CDX_LIST = cdxResult_list_filtered
+        cls.CDX_LIST = cdxResult_list_filtered
 
-    def create_entry(self, cdx_entry: dict, output: str) -> dict:
-        timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
-        domain, subdir, filename = self.split_url(url)
-        if self.MODE_CURRENT: download_dir = os.path.join(output, domain, subdir)
-        else: download_dir = os.path.join(output, domain, timestamp, subdir)
-        download_file = os.path.join(download_dir, filename)
-        cdx_entry = {
-                "id": len(self.SNAPSHOT_COLLECTION),
-                "url": self.create_archive_url(timestamp, url),
-                "file": download_file,
+    @classmethod
+    def count_list(cls):
+        return len(cls.CDX_LIST)
+
+    @classmethod
+    def create_collection(cls):
+        for cdx_entry in cls.CDX_LIST:
+            timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
+            url_archive = f"http://web.archive.org/web/{timestamp}{cls._url_get_filetype(url)}/{url}"
+            collection_entry = {
+                "id": len(cls.SNAPSHOT_COLLECTION),
                 "timestamp": timestamp,
-                "origin_url": url,
-                "success": False,
-                "retry": 0
+                "url_archive": url_archive,
+                "url_origin": url,
+                "file": False,
+                "redirect": False,
+                "http_code": False,
+                "http_message": False,
+                "retry": False
             }
-        return cdx_entry
+            cls.SNAPSHOT_COLLECTION.append(collection_entry)
+    
+    @classmethod
+    def snapshot_entry_create_output(cls, collection_entry: dict, output: str) -> str:
+        """
+        Create the output path for a snapshot entry of the collection according to the mode.
 
-    @classmethod    
-    def create_archive_url(cls, timestamp: str, url: str) -> str:
-        url_type = cls.__get_url_filetype(url)
-        return f"http://web.archive.org/web/{timestamp}{url_type}/{url}"
+        Input:
+        - collection_entry: A single snapshot entry of the collection (dict).
+        - output: The output directory (str).
 
-    def count_list(self):
-        return len(self.CDX_LIST)
-    
-    def snapshot_collection_write(self, query_entry: dict):
-        if query_entry["id"] not in self.SNAPSHOT_COLLECTION:
-            self.SNAPSHOT_COLLECTION.append(query_entry)
-    
-    def snapshot_collection_update(self, id: int, key: str, value: str):
-        index = next((index for (index, d) in enumerate(self.SNAPSHOT_COLLECTION) if d["id"] == id), None)
-        if index is not None:
-            self.SNAPSHOT_COLLECTION[index][key] = value
+        Output:
+        - download_file: The output path for the snapshot entry (str) with filename.
+        """
+        timestamp, url = collection_entry["timestamp"], collection_entry["url_origin"]
+        domain, subdir, filename = cls._url_split(url)
+        if cls.MODE_CURRENT:
+            download_dir = os.path.join(output, domain, subdir)
+        else:
+            download_dir = os.path.join(output, domain, timestamp, subdir)
+        download_file = os.path.join(download_dir, filename)
+        return download_file
+
+    @classmethod
+    def snapshot_entry_modify(cls, collection_entry: dict, key: str, value: str):
+        """
+        Modify a key-value pair in a snapshot entry of the collection (dict).
+
+        - Append a new key-value pair if the key does not exist.
+        - Modify an existing key-value pair if the key exists.
+        """
+        collection_entry[key] = value
 
     @classmethod
-    def get_url_filetype(cls, url):
+    def url_get_timestamp(cls, url):
+        """
+        Extract the timestamp from a wayback machine URL.
+        """
+        import re
+        timestamp = re.search(r'web.archive.org/web/(\d+)/', url).group(1)
+        return timestamp
+
+    @classmethod
+    def _url_get_filetype(cls, url):
         file_extension = os.path.splitext(url)[1][1:]
         urltype_mapping = {
             "jpg": "im_",
@@ -79,8 +100,11 @@ def get_url_filetype(cls, url):
         urltype = urltype_mapping.get(file_extension, "id_")
         return urltype
 
-    @staticmethod    
-    def split_url(url):
+    @classmethod
+    def _url_split(cls, url):
+        """
+        Split a URL into domain, subdir and filename.
+        """
         parsed_url = urlparse(url)
         domain = parsed_url.netloc
         subdir = parsed_url.path.strip("/").rsplit("/", 1)[0]
 
@@ -1,6 +1,6 @@
 import tqdm
 import json
-import pywaybackup.SnapshotCollection as sc
+from pywaybackup.SnapshotCollection import SnapshotCollection as sc
 
 class Verbosity:
 
@@ -10,8 +10,7 @@ class Verbosity:
     pbar = None
 
     @classmethod
-    def open(cls, args: list, snapshots: sc.SnapshotCollection):
-        cls.snapshots = snapshots
+    def open(cls, args: list):
         cls.args = args
         if cls.args == "progress":
             cls.mode = "progress"
@@ -25,20 +24,20 @@ def close(cls):
         if cls.mode == "progress":
             cls.pbar.close()
         if cls.mode == "progress" or cls.mode == "standard":
-            successed = len([snapshot for snapshot in cls.snapshots.SNAPSHOT_COLLECTION if snapshot["success"]])
-            failed = len([snapshot for snapshot in cls.snapshots.SNAPSHOT_COLLECTION if not snapshot["success"]])
-            print(f"\nSuccessed downloads: {successed}")
-            print(f"Failed downloads: {failed}")
+            successed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if snapshot["file"]])
+            failed = len([snapshot for snapshot in sc.SNAPSHOT_COLLECTION if not snapshot["file"]])
+            print(f"\nFiles downloaded: {successed}")
+            print(f"Files missing: {failed}")
             print("")
         if cls.mode == "json":
-            print(json.dumps(cls.snapshots.SNAPSHOT_COLLECTION, indent=4, sort_keys=True))
+            print(json.dumps(sc.SNAPSHOT_COLLECTION, indent=4, sort_keys=True))
 
     @classmethod
     def write(cls, message: str = None, progress: int = None):
         if cls.mode == "progress":
             if progress == 0:
                 print("")
-                maxval = cls.snapshots.count_list()
+                maxval = sc.count_list()
                 cls.pbar = tqdm.tqdm(total=maxval, desc="Downloading", unit=" snapshot", ascii="░▒█")
             elif progress == 1:
                 cls.pbar.update(1)
 
@@ -1 +1 @@
-__version__ = "0.6.4"
+__version__ = "0.7.0"
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.6.4"`
	`1`	`+__version__ = "0.7.0"`