Merge branch 'r/1.0.3'

bitdruid · bitdruid · commit 9ac5c53e8b78 · 2024-06-03T20:24:00.000+02:00
diff --git a/pywaybackup/SnapshotCollection.py b/pywaybackup/SnapshotCollection.py
@@ -117,9 +117,12 @@ def url_split(cls, url, index=False):
             url = "http://" + url
         parsed_url = urlparse(url)
         domain = parsed_url.netloc.split("@")[-1].split(":")[0] # split mailto: and port
-        filename = parsed_url.path.split("/")[-1]
-        if index is True and filename == "":
-            filename = "index.html"
-        subdir = parsed_url.path.strip("/").replace(filename, "").strip("/")
+        path_parts = parsed_url.path.split("/")
+        if not url.endswith("/") or "." in path_parts[-1]:
+            filename = path_parts[-1]
+            subdir = "/".join(path_parts[:-1]).strip("/")
+        else:
+            filename = "index.html" if index else ""
+            subdir = "/".join(path_parts).strip("/")
         filename = filename.replace("%20", " ") # replace url encoded spaces
-        return domain, subdir, filename
+        return domain, subdir, filename
diff --git a/pywaybackup/__version__.py b/pywaybackup/__version__.py
@@ -1 +1 @@
-__version__ = "1.0.2"
+__version__ = "1.0.3"
diff --git a/pywaybackup/archive.py b/pywaybackup/archive.py
@@ -14,6 +14,7 @@
 
 
 
+
 # GET: store page to wayback machine and response with redirect to snapshot
 # POST: store page to wayback machine and response with wayback machine status-page
 # tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
@@ -69,7 +70,6 @@ def save_page(url: str):
 
 
 
-
 def print_list():
     v.write("")
     count = sc.count_list()
@@ -146,6 +146,10 @@ def download_list(output, retry, no_redirect, workers):
     for thread in threads:
         thread.join()
 
+
+
+
+
 def download_loop(snapshot_batch, output, worker, retry, no_redirect, attempt=1, connection=None):
     """
     Download a list of URLs in a recursive loop. If a download fails, the function will retry the download.
@@ -174,6 +178,10 @@ def download_loop(snapshot_batch, output, worker, retry, no_redirect, attempt=1,
             time.sleep(15)
         download_loop(failed_urls, output, worker, retry, no_redirect, attempt, connection)
 
+
+
+
+
 def download(output, snapshot_entry, connection, status_message, no_redirect=False):
     """
     Download a single URL and save it to the specified filepath.
@@ -202,20 +210,21 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
                         response = connection.getresponse()
                         response_data = response.read()
                         response_status = response.status
+                        response_status_message = parse_response_code(response_status)
                         location = response.getheader("Location")
                         if location:
-                            status_message = f"{status_message}\n" + \
-                                f"           -> URL: {location}"
                             location = urljoin(download_url, location)
                             download_url = location
+                            status_message = f"{status_message}\n" + \
+                                f"           -> URL: {download_url}"
                             sc.snapshot_entry_modify(snapshot_entry, "redirect_timestamp", sc.url_get_timestamp(location))
                             sc.snapshot_entry_modify(snapshot_entry, "redirect_url", location)
                         else:
                             break
             if response_status == 200:
-                sc.snapshot_entry_modify(snapshot_entry, "file", sc.snapshot_entry_create_output(snapshot_entry, output))
-                download_file = snapshot_entry["file"]
-                os.makedirs(os.path.dirname(download_file), exist_ok=True)
+                download_file = sc.snapshot_entry_create_output(snapshot_entry, output)
+                download_path = os.path.dirname(download_file)
+                os.makedirs(download_path, exist_ok=True)
                 with open(download_file, 'wb') as file:
                     if response.getheader('Content-Encoding') == 'gzip':
                         response_data = gzip.decompress(response_data)
@@ -227,12 +236,13 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
                         f"SUCCESS    -> HTTP: {response_status} - {response_status_message}\n" + \
                         f"           -> URL: {download_url}\n" + \
                         f"           -> FILE: {download_file}"
+                    sc.snapshot_entry_modify(snapshot_entry, "file", download_file)
                 v.write(status_message)
                 return True
             else:
                 status_message = f"{status_message}\n" + \
                     f"UNEXPECTED -> HTTP: {response_status} - {response_status_message}\n" + \
-                    f"           -> URL: {download_url}\n"
+                    f"           -> URL: {download_url}"
                 v.write(status_message)
                 return True
         # exception returns false and appends the url to the failed list
@@ -262,6 +272,10 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
     500: "Internal Server Error",
     503: "Service Unavailable"
 }
+
+
+
+
 def parse_response_code(response_code: int):
     """
     Parse the response code of the Wayback Machine and return a human-readable message.
@@ -270,6 +284,10 @@ def parse_response_code(response_code: int):
         return RESPONSE_CODE_DICT[response_code]
     return "Unknown response code"
 
+
+
+
+
 def save_csv(csv_path: str, url: str):
     """
     Write a CSV file with the list of snapshots.
@@ -285,4 +303,4 @@ def save_csv(csv_path: str, url: str):
             row = csv.DictWriter(file, sc.SNAPSHOT_COLLECTION[0].keys())
             row.writeheader()
             for snapshot in sc.SNAPSHOT_COLLECTION:
-                row.writerow(snapshot)
+                row.writerow(snapshot)

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "1.0.2"`
	`1`	`+__version__ = "1.0.3"`