Skip to content

Commit 3f58d6c

Browse files
committed
Merge branch 'f/trim-json'
1 parent 4f12a31 commit 3f58d6c

File tree

6 files changed

+30
-38
lines changed

6 files changed

+30
-38
lines changed

README.md

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,7 @@ Internet-archive is a nice source for several OSINT-information. This script is
3131

3232
This script allows you to download content from the Wayback Machine (archive.org). You can use it to download either the latest version or all versions of web page snapshots within a specified range.
3333

34-
## Info
35-
36-
- The script will only request status code 200 snapshots (for now) - but this can differ from the status code when downloading the file.
34+
<!-- ## Info -->
3735

3836
### Arguments
3937

@@ -65,8 +63,7 @@ Specify the range in years or a specific timestamp either start, end or both. If
6563

6664
#### Additional
6765

68-
- `--redirect`: Follow redirects of snapshots. Default is False. If a source has not statuscode 200, archive.org will redirect to the closest snapshot. So when setting this to `true`, parts of a timestamp-folder may not truly belong to the given timestamp.
69-
<!-- - `--harvest`: The downloaded files are scanned for locations on the same domain. These locations (mostly resources) are then tried to be accessed within the same timestamp. Setting this to `true` may result in identical files in different timestamps but you may get a more complete snapshot of the website. -->
66+
- `--no-redirect`: Do not follow redirects of snapshots. Archive.org sometimes redirects to a different snapshot for several reasons. Downloading redirects may lead to timestamp folders that contain some files with a different timestamp. This does not matter if you only want to download the latest version (`-c`).
7067
- `--verbosity [LEVEL]`: Set the verbosity: json (print the JSON response), progress (show a progress bar), or standard (default).
7168
- `--retry [RETRY_FAILED]`: Retry failed downloads. You can specify the number of retry attempts as an integer.
7269
- `--worker [AMOUNT]`: The number of workers to use for downloading (simultaneous downloads). Default is 1. Beware: using too many workers will lead to refused connections from the Wayback Machine. Duration is about 1.5 minutes per worker batch.

pywaybackup/SnapshotCollection.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class SnapshotCollection:
99

1010
@classmethod
1111
def create_list_full(cls, cdxResult):
12-
cls.CDX_LIST = sorted([{"timestamp": snapshot[0], "url": snapshot[1]} for i, snapshot in enumerate(cdxResult.json()[1:])], key=lambda k: k['timestamp'], reverse=True)
12+
cls.CDX_LIST = sorted([{"timestamp": snapshot[0], "url": snapshot[1], "status": snapshot[2], "mimetype": snapshot[3], "digest": snapshot[4]} for i, snapshot in enumerate(cdxResult.json()[1:])], key=lambda k: k['timestamp'], reverse=True)
1313

1414
@classmethod
1515
def create_list_current(cls):
@@ -38,9 +38,7 @@ def create_collection(cls):
3838
"url_origin": url,
3939
"file": False,
4040
"redirect": False,
41-
"http_code": False,
42-
"http_message": False,
43-
"retry": False
41+
"response": False
4442
}
4543
cls.SNAPSHOT_COLLECTION.append(collection_entry)
4644

@@ -80,8 +78,8 @@ def url_get_timestamp(cls, url):
8078
"""
8179
Extract the timestamp from a wayback machine URL.
8280
"""
83-
import re
84-
timestamp = re.search(r'web.archive.org/web/(\d+)/', url).group(1)
81+
timestamp = url.split("web.archive.org/web/")[1].split("/")[0]
82+
timestamp = ''.join([char for char in timestamp if char.isdigit()])
8583
return timestamp
8684

8785
@classmethod

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.0"
1+
__version__ = "0.7.1"

pywaybackup/archive.py

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def query_list(url: str, range: int, start: int, end: int, explicit: bool, mode:
9595
else:
9696
query_range = "&from=" + str(datetime.now().year - range)
9797
cdx_url = f"*.{url}/*" if not explicit else f"{url}"
98-
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{query_range}&fl=timestamp,original,statuscode&filter!=statuscode:200"
98+
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url={cdx_url}{query_range}&fl=timestamp,original,statuscode,mimetype,digest&filter!=statuscode:200"
9999
cdxResult = requests.get(cdxQuery)
100100
sc.create_list_full(cdxResult)
101101
sc.create_list_current() if mode == "current" else None
@@ -108,7 +108,7 @@ def query_list(url: str, range: int, start: int, end: int, explicit: bool, mode:
108108

109109

110110
# example download: http://web.archive.org/web/20190815104545id_/https://www.google.com/
111-
def download_list(output, retry, redirect, worker):
111+
def download_list(output, retry, no_redirect, worker):
112112
"""
113113
Download a list of urls in format: [{"timestamp": "20190815104545", "url": "https://www.google.com/"}]
114114
"""
@@ -127,13 +127,13 @@ def download_list(output, retry, redirect, worker):
127127
worker = 0
128128
for batch in batch_list:
129129
worker += 1
130-
thread = threading.Thread(target=download_loop, args=(batch, output, worker, retry, redirect))
130+
thread = threading.Thread(target=download_loop, args=(batch, output, worker, retry, no_redirect))
131131
threads.append(thread)
132132
thread.start()
133133
for thread in threads:
134134
thread.join()
135135

136-
def download_loop(snapshot_batch, output, worker, retry, redirect, attempt=1, connection=None):
136+
def download_loop(snapshot_batch, output, worker, retry, no_redirect, attempt=1, connection=None):
137137
"""
138138
Download a list of URLs in a recursive loop. If a download fails, the function will retry the download.
139139
The "snapshot_collection" dictionary will be updated with the download status and file information.
@@ -149,9 +149,8 @@ def download_loop(snapshot_batch, output, worker, retry, redirect, attempt=1, co
149149
return
150150
for snapshot in snapshot_batch:
151151
status = f"\n-----> Attempt: [{attempt}/{max_attempt}] Snapshot [{snapshot_batch.index(snapshot)+1}/{len(snapshot_batch)}] - Worker: {worker}"
152-
download_status = download(output, snapshot, connection, status, redirect)
152+
download_status = download(output, snapshot, connection, status, no_redirect)
153153
if not download_status:
154-
if retry > 0: sc.snapshot_entry_modify(snapshot, "retry", attempt)
155154
failed_urls.append(snapshot)
156155
if download_status:
157156
v.write(progress=1)
@@ -160,9 +159,9 @@ def download_loop(snapshot_batch, output, worker, retry, redirect, attempt=1, co
160159
if not attempt > max_attempt:
161160
v.write(f"\n-----> Worker: {worker} - Retry Timeout: 10 seconds")
162161
time.sleep(15)
163-
download_loop(failed_urls, output, worker, retry, redirect, attempt, connection)
162+
download_loop(failed_urls, output, worker, retry, no_redirect, attempt, connection)
164163

165-
def download(output, snapshot_entry, connection, status_message, redirect=False):
164+
def download(output, snapshot_entry, connection, status_message, no_redirect=False):
166165
"""
167166
Download a single URL and save it to the specified filepath.
168167
If there is a redirect, the function will follow the redirect and update the download URL.
@@ -180,9 +179,8 @@ def download(output, snapshot_entry, connection, status_message, redirect=False)
180179
response_data = response.read()
181180
response_status = response.status
182181
response_status_message = parse_response_code(response_status)
183-
sc.snapshot_entry_modify(snapshot_entry, "http_code", response_status)
184-
sc.snapshot_entry_modify(snapshot_entry, "http_message", response_status_message)
185-
if redirect:
182+
sc.snapshot_entry_modify(snapshot_entry, "response", response_status)
183+
if not no_redirect:
186184
if response_status == 302:
187185
status_message = f"{status_message}\n" + \
188186
f"REDIRECT -> HTTP: {response.status}"
@@ -242,20 +240,20 @@ def download(output, snapshot_entry, connection, status_message, redirect=False)
242240
v.write(f"FAILED -> download, append to failed_urls: {download_url}")
243241
return False
244242

243+
RESPONSE_CODE_DICT = {
244+
200: "OK",
245+
301: "Moved Permanently",
246+
302: "Found (redirect)",
247+
400: "Bad Request",
248+
403: "Forbidden",
249+
404: "Not Found",
250+
500: "Internal Server Error",
251+
503: "Service Unavailable"
252+
}
245253
def parse_response_code(response_code: int):
246254
"""
247255
Parse the response code of the Wayback Machine and return a human-readable message.
248256
"""
249-
response_code_dict = {
250-
200: "OK",
251-
301: "Moved Permanently",
252-
302: "Found (redirect)",
253-
400: "Bad Request",
254-
403: "Forbidden",
255-
404: "Not Found",
256-
500: "Internal Server Error",
257-
503: "Service Unavailable"
258-
}
259-
if response_code in response_code_dict:
260-
return response_code_dict[response_code]
257+
if response_code in RESPONSE_CODE_DICT:
258+
return RESPONSE_CODE_DICT[response_code]
261259
return "Unknown response code"

pywaybackup/arguments.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@ def parse():
2222
optional.add_argument('--end', type=int, help='End timestamp format: YYYYMMDDhhmmss')
2323

2424
special = parser.add_argument_group('special')
25-
special.add_argument('--redirect', action='store_true', help='Follow redirects by archive.org')
26-
# special.add_argument('--harvest', action='store_true', help='Harvest location tags from snapshots and try to get as much as possible')
25+
special.add_argument('--no-redirect', action='store_true', help='Do not follow redirects by archive.org')
2726
special.add_argument('--verbosity', type=str, default="standard", choices=["standard", "progress", "json"], help='Verbosity level')
2827
special.add_argument('--retry', type=int, default=0, metavar="X-TIMES", help='Retry failed downloads (opt tries as int, else infinite)')
2928
special.add_argument('--worker', type=int, default=1, metavar="AMOUNT", help='Number of worker (simultaneous downloads)')

pywaybackup/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def main():
2323
if args.list:
2424
archive.print_list()
2525
else:
26-
archive.download_list(args.output, args.retry, args.redirect, args.worker)
26+
archive.download_list(args.output, args.retry, args.no_redirect, args.worker)
2727
#archive.remove_empty_folders(args.output)
2828
v.close()
2929

0 commit comments

Comments
 (0)