bitdruid
diff --git a/‎README.md
+47-48 b/‎README.md
+47-48
diff --git a/‎pywaybackup/Arguments.py
+30-38 b/‎pywaybackup/Arguments.py
+30-38
diff --git a/‎pywaybackup/Converter.py
-1 b/‎pywaybackup/Converter.py
-1
@@ -32,7 +32,7 @@ This tool allows you to download content from the Wayback Machine (archive.org).
 
 - Linux recommended: On Windows machines, the path length is limited. This can only be overcome by editing the registry. Files that exceed the path length will not be downloaded.
 - If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
-- The tool will inform you if your query has an immense amount of snapshots which could consume your system memory and lead to a crash. Consider splitting your query into smaller jobs by specifying a range e.g. `--start 2023 --end 2024` or `--range 1`.
+- The tool uses a sqlite database to handle snapshots. The database will only persist while the download is running.
 
 ## Arguments
 
@@ -54,12 +54,14 @@ This tool allows you to download content from the Wayback Machine (archive.org).
 
 ### Optional query parameters
 
-- **`-l`**, **`--list`**:<br>
-  Only print the snapshots available within the specified range. Does not download the snapshots.
 - **`-e`**, **`--explicit`**:<br>
   Only download the explicit given URL. No wildcard subdomains or paths. Use e.g. to get root-only snapshots. This is recommended for explicit files like `login.html` or `?query=this`.
-- **`-o`**, **`--output`**:<br>
-  Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
+
+- **`--filetype`** `<filetype>`:<br>
+  Specify filetypes to download. Default is all filetypes. Separate multiple filetypes with a comma. Example: `--filetype jpg,css,js`. A filter will result in a filtered cdx-file. So if you want to download all files later, you need to query again without the filter. Filetypes are filtered as they are in the snapshot. So if there is no explicit `html` file in the path (common practice), then you can't filter them.
+
+- **`--limit`** `<count>`:<br>
+Limits the number of snapshots to query from the CDX server. If an existing CDX file is injected, the limit will have no effect. So you would need to set `--keep`.
 
 - **Range Selection:**<br>
   Specify the range in years or a specific timestamp either start, end, or both. If you specify the `range` argument, the `start` and `end` arguments will be ignored. Format for timestamps: YYYYMMDDhhmmss. You can only give a year or increase specificity by going through the timestamp starting on the left.<br>
@@ -71,72 +73,68 @@ This tool allows you to download content from the Wayback Machine (archive.org).
    - **`--end`**:<br>
      Timestamp to end searching.
 
-### Additional behavior manipulation
-  
-- **`--csv`** `<path>`:<br>
-Path defaults to output-dir. Saves a CSV file with the json-response for successfull downloads. If `--list` is set, the CSV contains the CDX list of snapshots. If `--current` or `--full` is set, CSV contains downloaded files. Named as `waybackup_<sanitized_url>.csv`.
+### Behavior manipulation
 
-- **`--skip`** `<path>`:<br>
-Path defaults to output-dir. Checks for an existing `waybackup_<sanitized_url>.csv` for URLs to skip downloading. Useful for interrupted downloads. Files are checked by their root-domain, ensuring consistency across queries. This means that if you download `http://example.com/subdir1/` and later `http://example.com`, the second query will skip the first path.
-  
-- **`--no-redirect`**:<br>
-Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
-  
-- **`--verbosity`** `<level>`:<br>
-Sets verbosity level. Options are `json` (prints JSON response) or `progress` (shows progress bar).
-<!-- Alternatively set verbosity level to `trace` for a very detailed output. -->
+- **`-o`**, **`--output`**:<br>
+Defaults to `waybackup_snapshots` in the current directory. The folder where downloaded files will be saved.
 
-- **`--log`** `<path>`:<br>
-Path defaults to output-dir. Saves a log file with the output of the tool. Named as `waybackup_<sanitized_url>.log`.
+<!-- - **`--verbosity`** `<level>`:<br>
+Sets verbosity level. Options are `info` and `trace`. Default is `info`. -->
+
+- **`--log`** <!-- `<path>` -->:<br>
+Saves a log file into the output-dir. Named as `waybackup_<sanitized_url>.log`.
+
+- **`--progress`**:<br>
+Shows a progress bar instead of the default output.
 
 - **`--workers`** `<count>`:<br>
 Sets the number of simultaneous download workers. Default is 1, safe range is about 10. Be cautious as too many workers may lead to refused connections from the Wayback Machine.
+
+- **`--no-redirect`**:<br>
+Disables following redirects of snapshots. Useful for preventing timestamp-folder mismatches caused by Archive.org redirects.
 
 - **`--retry`** `<attempts>`:<br>
 Specifies number of retry attempts for failed downloads.
 
 - **`--delay`** `<seconds>`:<br>
 Specifies delay between download requests in seconds. Default is no delay (0).
 
-- **`--limit`** `<count>`:<br>
-Limits the amount of snapshots to query from the CDX server. If an existing CDX file is injected (with `--cdxinject` or `--auto`), the limit will have no effect.
-
 <!-- - **`--convert-links`**:<br>
 If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->
 
-**CDX Query Result Handling:**
-- **`--cdxbackup`** `<path>`:<br>
-Path defaults to output-dir. Saves the result of CDX query as a file. Useful for later downloading snapshots and overcoming refused connections by CDX server due to too many queries. Named as `waybackup_<sanitized_url>.cdx`.
-  
-- **`--cdxinject`** `<filepath>`:<br>
-Injects a CDX query file to download snapshots. Ensure the query matches the previous `--url` for correct folder structure.
+## Special:
+
+- **`--reset`**:  
+  If set, the job will be reset, and any existing `cdx`, `db`, `csv` files will be **deleted**. This allows you to start the job from scratch without considering previously downloaded data.
 
-**Auto:**
-- **`--auto`**:<br>
-If set, csv, skip and cdxbackup/cdxinject are handled automatically. Keep the files and folders as they are. Otherwise they will not be recognized when restarting a download.
+- **`--keep`**:  
+  If set, all files will be kept after the job is finished. This includes the `cdx` and `db` file. Without this argument, they will be deleted if the job finished successfully.
 
 ### Examples
 
-Download latest snapshot of all files:<br>
+Download the latest snapshot of all available files:<br>
 `waybackup -u http://example.com -c`
 
-Download latest snapshot of a specific file:<br>
-`waybackup -u http://example.com/subdir/file.html -c`
+Download the latest snapshot of a specific file (e.g., a login page):<br>
+`waybackup -u http://example.com/login.html -c --explicit`
 
-Download all snapshots sorted per timestamp with a specified range and do not follow redirects:<br>
+Download all snapshots within the last 5 years and prevent redirects:<br>
 `waybackup -u http://example.com -f -r 5 --no-redirect`
 
-Download all snapshots sorted per timestamp with a specified range and save to a specified folder with 3 workers:<br>
+Download all snapshots from a specific range (2020 to December 12, 2022) with 4 workers, and show a progress bar:<br>
+`waybackup -u http://example.com -f --start 2020 --end 20221212 --workers 4 --progress`
+
+Download all snapshots and save the output in a specific folder with 3 workers:<br>
 `waybackup -u http://example.com -f -r 5 -o /home/user/Downloads/snapshots --workers 3`
 
-Download all snapshots from 2020 to 12th of December 2022 with 4 workers, save a csv and show a progress bar:
-`waybackup -u http://example.com -f --start 2020 --end 20221212 --workers 4 --csv --verbosity progress`
+Download all snapshots but only images and CSS files, filtering for specific filetypes (jpg, css):<br>
+`waybackup -u http://example.com -f --filetype jpg,css`
 
-Download all snapshots and output a json response:<br>
-`waybackup -u http://example.com -f --verbosity json`
+Download all timestamps but start over and ignore existing progress, log the output, and retry 3 times if any error occurs:<br>
+`waybackup -u http://example.com -f --log --retry 3 --reset`
 
-List available snapshots per timestamp without downloading and save a csv file to home folder:<br>
-`waybackup -u http://example.com -f -l --csv /home/user/Downloads`
+Download the latest snapshot, follow no redirects but keep the database and cdx-file:<br>
+`waybackup -u http://example.com -c --no-redirect --keep`
 
 ## Output path structure
 
@@ -175,8 +173,9 @@ your/path/waybackup_snapshots/
     ...
 ```
 
+## CSV Output
 
-### Json Response
+Each snapshot is stored with the following keys/values. These are either stored in a sqlite database while the download is running or saved into a CSV file after the download is finished.
 
 For download queries:
 
@@ -212,14 +211,14 @@ For list queries:
 ]
 ```
 
-## CSV Output
-
-The csv contains the json response in a table format.
-
 ### Debugging
 
 Exceptions will be written into `waybackup_error.log` (each run overwrites the file).
 
+### Known ToDos
+
+- [ ] Currently there is no logic to handle the case where both an http and an https version of a page are available.
+
 ## Contributing
 
 I'm always happy for some feature requests to improve the usability of this tool.
 
@@ -22,35 +22,34 @@ def __init__(self):
         exclusive_required.add_argument('-s', '--save', action='store_true', help='save a page to the wayback machine')
 
         optional = parser.add_argument_group('optional query parameters')
-        optional.add_argument('-l', '--list', action='store_true', help='only print snapshots (opt range in y)')
         optional.add_argument('-e', '--explicit', action='store_true', help='search only for the explicit given url')
-        optional.add_argument('-o', '--output', type=str, metavar="", help='output folder - defaults to current directory')
         optional.add_argument('-r', '--range', type=int, metavar="", help='range in years to search')
         optional.add_argument('--start', type=int, metavar="", help='start timestamp format: YYYYMMDDhhmmss')
         optional.add_argument('--end', type=int, metavar="", help='end timestamp format: YYYYMMDDhhmmss')
-
-        special = parser.add_argument_group('manipulate behavior')
-        special.add_argument('--csv', type=str, nargs='?', const=True, metavar='path', help='save a csv file with the json output - defaults to output folder')
-        special.add_argument('--skip', type=str, nargs='?', const=True, metavar='path', help='skips existing files in the output folder by checking the .csv file - defaults to output folder')
-        special.add_argument('--no-redirect', action='store_true', help='do not follow redirects by archive.org')
-        special.add_argument('--verbosity', type=str, default="info", metavar="", help='["progress", "json"] for different output or ["trace"] for very detailed output')
-        special.add_argument('--log', type=str, nargs='?', const=True, metavar='path', help='save a log file - defaults to output folder')
-        special.add_argument('--retry', type=int, default=0, metavar="", help='retry failed downloads (opt tries as int, else infinite)')
-        special.add_argument('--workers', type=int, default=1, metavar="", help='number of workers (simultaneous downloads)')
-        # special.add_argument('--convert-links', action='store_true', help='Convert all links in the files to local paths. Requires -c/--current')
-        special.add_argument('--delay', type=int, default=0, metavar="", help='delay between each download in seconds')
-        special.add_argument('--limit', type=int, nargs='?', const=True, metavar='int', help='limit the number of snapshots to download')
-
-        cdx = parser.add_argument_group('cdx (one exclusive)')
-        exclusive_cdx = cdx.add_mutually_exclusive_group()
-        exclusive_cdx.add_argument('--cdxbackup', type=str, nargs='?', const=True, metavar='path', help='Save the cdx query-result to a file for recurent use - defaults to output folder')
-        exclusive_cdx.add_argument('--cdxinject', type=str, nargs='?', const=True, metavar='path', help='Inject a cdx backup-file to download according to the given url')
-
-        auto = parser.add_argument_group('auto')
-        auto.add_argument('--auto', action='store_true', help='includes automatic csv, skip and cdxbackup/cdxinject to resume a stopped download')
+        optional.add_argument('--filetype', type=str, metavar="", help='filetypes to download comma separated (e.g. "html,css")')
+        optional.add_argument('--limit', type=int, nargs='?', const=True, metavar='int', help='limit the number of snapshots to download')
+
+        behavior = parser.add_argument_group('manipulate behavior')
+        behavior.add_argument('-o', '--output', type=str, metavar="", help='output folder - defaults to current directory')
+        behavior.add_argument('--log', action='store_true', help='save a log file into the output folder')
+        behavior.add_argument('--progress', action='store_true', help='show a progress bar')
+        behavior.add_argument('--no-redirect', action='store_true', help='do not follow redirects by archive.org')
+        #behavior.add_argument('--verbosity', type=str, default="info", metavar="", help='verbosity level (info, trace)')
+        behavior.add_argument('--retry', type=int, default=0, metavar="", help='retry failed downloads (opt tries as int, else infinite)')
+        behavior.add_argument('--workers', type=int, default=1, metavar="", help='number of workers (simultaneous downloads)')
+        # behavior.add_argument('--convert-links', action='store_true', help='Convert all links in the files to local paths. Requires -c/--current')
+        behavior.add_argument('--delay', type=int, default=0, metavar="", help='delay between each download in seconds')
+
+        special = parser.add_argument_group('special')
+        special.add_argument('--reset', action='store_true', help='reset the job and ignore existing cdx/db/csv files')
+        special.add_argument('--keep', action='store_true', help='keep all files after the job finished')
 
         args = parser.parse_args(args=None if sys.argv[1:] else ['--help']) # if no arguments are given, print help
 
+        required_args = {action.dest: getattr(args, action.dest) for action in exclusive_required._group_actions}
+        optional_args = {action.dest: getattr(args, action.dest) for action in optional._group_actions}
+        args.query_identifier = str(args.url) + str(required_args) + str(optional_args)
+
         # if args.convert_links and not args.current:
         #     parser.error("--convert-links can only be used with the -c/--current option")
 
@@ -84,21 +83,14 @@ def init(cls):
         if cls.current:
             cls.mode = "current"
 
-        cls.cdxbackup = cls.output if cls.cdxbackup is None else cls.cdxbackup
-
-        if cls.auto:
-            cls.skip = cls.output
-            cls.csv = cls.output
-            cls.cdxbackup = cls.output
-            cls.cdxinject = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.cdx")
-        else:
-            if cls.skip is True:
-                cls.skip = cls.output
-            if cls.csv is True:
-                cls.csv = cls.output
-            if cls.cdxbackup is True:
-                cls.cdxbackup = cls.output
-            if cls.cdxinject is True:
-                cls.cdxinject = cls.output
+        if cls.filetype:
+            cls.filetype = [ft.lower().strip() for ft in cls.filetype.split(",")]
 
+        cls.cdxfile = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.cdx")
+        cls.dbfile = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.db")
+        cls.csvfile = os.path.join(cls.output, f"waybackup_{sanitize_filename(cls.url)}.csv")
 
+        if cls.reset:
+            os.remove(cls.cdxfile) if os.path.isfile(cls.cdxfile) else None
+            os.remove(cls.dbfile) if os.path.isfile(cls.dbfile) else None
+            os.remove(cls.csvfile) if os.path.isfile(cls.csvfile) else None
@@ -4,7 +4,6 @@
 from pywaybackup.helper import url_split
 
 from pywaybackup.Arguments import Configuration as config
-from pywaybackup.Verbosity import Verbosity as vb
 import re
 
 class Converter: