Merge branch 'f/save-pages-to-archive'

bitdruid@vbox · bitdruid@vbox · commit 532e1361a156 · 2024-02-26T10:18:12.000+01:00
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@ Internet-archive is a nice source for several OSINT-information. This script is
 ### Manual
 
 1. Clone the repository <br>
-   ```git clone https://github.com/bitdruid/waybackup.git```
+   ```git clone https://github.com/bitdruid/python-wayback-machine-downloader.git```
 2. Install <br>
    ```pip install .```
    - in a virtual env or use `--break-system-package`
@@ -42,8 +42,9 @@ This script allows you to download content from the Wayback Machine (archive.org
 
 #### Mode Selection (Choose One)
 
-- `-c`, `--current`: Download the latest version of each file snapshot. This option is mutually exclusive with `-f/--full`.
-- `-f`, `--full`: Download snapshots of all timestamps. This option is mutually exclusive with `-c/--current`.
+- `-c`, `--current`: Download the latest version of each file snapshot.
+- `-f`, `--full`: Download snapshots of all timestamps.
+- `-s`, `--save`: Save a page to the Wayback Machine. (beta)
 
 #### Optional Arguments
 
diff --git a/pywaybackup/__version__.py b/pywaybackup/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.2"
+__version__ = "0.5.0"
diff --git a/pywaybackup/archive.py b/pywaybackup/archive.py
@@ -1,12 +1,71 @@
 #import threading
 import requests
-import datetime
 import os
 import magic
 import threading
 import time
 import http.client
 from urllib.parse import urljoin
+from datetime import datetime, timezone
+
+
+
+
+# GET: stores the page in the Wayback Machine and responds with a redirect to the snapshot
+# POST: stores the page in the Wayback Machine and responds with the Wayback Machine status page
+# tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
+# tag_result_timeout = '<p>The same snapshot had been made %s minutes ago. You can make new capture of this URL after 1 hour.</p>'
+# tag_result_success = ' A snapshot was captured. Visit page: <a href="%s">%s</a>'
+def save_page(url: str):
+    """
+    Saves a webpage to the Wayback Machine. 
+
+    Args:
+        url (str): The URL of the webpage to be saved.
+
+    Returns:
+        None: The function does not return any value. It only prints messages to the console.
+    """
+    print("\nSaving page to the Wayback Machine...")
+    connection = http.client.HTTPSConnection("web.archive.org")
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
+    }
+    connection.request("GET", f"https://web.archive.org/save/{url}", headers=headers)
+    print("\n-----> Request sent")
+    response = connection.getresponse()
+    response_status = response.status
+
+    if response_status == 302:
+        location = response.getheader("Location")
+        print("\n-----> Response: 302 (redirect to snapshot)")
+        snapshot_timestamp = datetime.strptime(location.split('/web/')[1].split('/')[0], '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
+        current_timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
+        timestamp_difference = (datetime.strptime(current_timestamp, '%Y-%m-%d %H:%M:%S') - datetime.strptime(snapshot_timestamp, '%Y-%m-%d %H:%M:%S')).seconds / 60
+        timestamp_difference = int(round(timestamp_difference, 0))
+
+        if timestamp_difference < 1:
+            print("\n-----> New snapshot created")
+        elif timestamp_difference > 1:
+            print(f"\n-----> Snapshot already exists. (1 hour limit) - wait for {60 - timestamp_difference} minutes")
+            print(f"TIMESTAMP SNAPSHOT: {snapshot_timestamp}")
+            print(f"TIMESTAMP REQUEST : {current_timestamp}")
+            print(f"\nLAST SNAPSHOT BACK: {timestamp_difference} minutes")
+
+        print(f"\nURL: {location}")
+
+    elif response_status == 404:
+        print("\n-----> Response: 404 (not found)")
+        print(f"\nFAILED -> URL: {url}")
+    else:
+        print("\n-----> Response: unexpected")
+        print(f"\nFAILED -> URL: {url}")
+
+    connection.close()
+
+
+
+
 
 def print_result(result_list):
     print("")
@@ -16,11 +75,15 @@ def print_result(result_list):
         __import__('pprint').pprint(result_list)
         print(f"\n-----> {len(result_list)} snapshots listed")
 
+
+
+
+
 # create filelist
 def query_list(url: str, range: int, mode: str):
     print("\nQuerying snapshots...")
     if range:
-        range = datetime.datetime.now().year - range
+        range = datetime.now().year - range
         range = "&from=" + str(range)
     else:
         range = ""
diff --git a/pywaybackup/main.py b/pywaybackup/main.py
@@ -12,6 +12,7 @@ def main():
     exclusive_required = required.add_mutually_exclusive_group(required=True)
     exclusive_required.add_argument('-c', '--current', action='store_true', help='Download the latest version of each file snapshot (opt range in y)')
     exclusive_required.add_argument('-f', '--full', action='store_true', help='Download snapshots of all timestamps (opt range in y)')
+    exclusive_required.add_argument('-s', '--save', action='store_true', help='Save a page to the wayback machine')
     optional = parser.add_argument_group('optional')
     optional.add_argument('-l', '--list', action='store_true', help='Only print snapshots (opt range in y)')
     optional.add_argument('-r', '--range', type=int, help='Range in years to search')
@@ -27,16 +28,20 @@ def main():
     if args.full:
         mode = "full"
 
-    if args.output is None:
-        args.output = os.path.join(os.getcwd(), "waybackup_snapshots")
-    cdxResult_list = archive.query_list(args.url, args.range, mode)
-    if args.list:
-        archive.print_result(cdxResult_list)
-    if not args.list:
-        archive.download_prepare_list(cdxResult_list, args.output, args.retry, args.worker, mode)
-        archive.remove_empty_folders(args.output)
-    # if args.detect_filetype:
-    #     archive.detect_filetype(args.output)
+    if args.save:
+        archive.save_page(args.url)
+    else:
+        if args.output is None:
+            args.output = os.path.join(os.getcwd(), "waybackup_snapshots")
+        cdxResult_list = archive.query_list(args.url, args.range, mode)
+        if args.list:
+            archive.print_result(cdxResult_list)
+        if not args.list and not args.save:
+            archive.download_prepare_list(cdxResult_list, args.output, args.retry, args.worker, mode)
+            archive.remove_empty_folders(args.output)
+        # if args.detect_filetype:
+        #     archive.detect_filetype(args.output)
+    print("")
 
 if __name__ == "__main__":
     main()
diff --git a/setup.py b/setup.py
@@ -31,5 +31,5 @@ def parse_requirements(filename):
     long_description_content_type='text/markdown',
     license='MIT',
     keywords='wayback machine internet archive',
-    url='https://github.com/bitdruid/waybackup',
+    url='https://github.com/bitdruid/python-wayback-machine-downloader',
 )

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.4.2"`
	`1`	`+__version__ = "0.5.0"`
Original file line number	Diff line number	Diff line change
`@@ -31,5 +31,5 @@ def parse_requirements(filename):`
`31`	`31`	`long_description_content_type='text/markdown',`
`32`	`32`	`license='MIT',`
`33`	`33`	`keywords='wayback machine internet archive',`
`34`		`- url='https://github.com/bitdruid/waybackup',`
	`34`	`+ url='https://github.com/bitdruid/python-wayback-machine-downloader',`
`35`	`35`	`)`