Skip to content

Commit ede9ba2

Browse files
committed
Merge branch 'h/exception-handling'
1 parent 532e136 commit ede9ba2

File tree

2 files changed

+35
-24
lines changed

2 files changed

+35
-24
lines changed

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.0"
1+
__version__ = "0.5.1"

pywaybackup/archive.py

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -81,26 +81,30 @@ def print_result(result_list):
8181

8282
# create filelist
8383
def query_list(url: str, range: int, mode: str):
84-
print("\nQuerying snapshots...")
85-
if range:
86-
range = datetime.now().year - range
87-
range = "&from=" + str(range)
88-
else:
89-
range = ""
90-
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url=*.{url}/*{range}&fl=timestamp,original&filter=!statuscode:200"
91-
cdxResult = requests.get(cdxQuery)
92-
if cdxResult.status_code != 200: print(f"\n-----> ERROR: could not query snapshots, status code: {cdxResult.status_code}"); exit()
93-
cdxResult_json = cdxResult.json()[1:] # first line is fieldlist, so remove it [timestamp, original
94-
cdxResult_list = [{"timestamp": snapshot[0], "url": snapshot[1]} for snapshot in cdxResult_json]
95-
if mode == "current":
96-
cdxResult_list = sorted(cdxResult_list, key=lambda k: k['timestamp'], reverse=True)
97-
cdxResult_list_filtered = []
98-
for snapshot in cdxResult_list:
99-
if snapshot["url"] not in [snapshot["url"] for snapshot in cdxResult_list_filtered]:
100-
cdxResult_list_filtered.append(snapshot)
101-
cdxResult_list = cdxResult_list_filtered
102-
print(f"\n-----> {len(cdxResult_list)} snapshots found")
103-
return cdxResult_list
84+
try:
85+
print("\nQuerying snapshots...")
86+
if range:
87+
range = datetime.now().year - range
88+
range = "&from=" + str(range)
89+
else:
90+
range = ""
91+
cdxQuery = f"https://web.archive.org/cdx/search/xd?output=json&url=*.{url}/*{range}&fl=timestamp,original&filter=!statuscode:200"
92+
cdxResult = requests.get(cdxQuery)
93+
if cdxResult.status_code != 200: print(f"\n-----> ERROR: could not query snapshots, status code: {cdxResult.status_code}"); exit()
94+
cdxResult_json = cdxResult.json()[1:] # first line is fieldlist, so remove it [timestamp, original
95+
cdxResult_list = [{"timestamp": snapshot[0], "url": snapshot[1]} for snapshot in cdxResult_json]
96+
if mode == "current":
97+
cdxResult_list = sorted(cdxResult_list, key=lambda k: k['timestamp'], reverse=True)
98+
cdxResult_list_filtered = []
99+
for snapshot in cdxResult_list:
100+
if snapshot["url"] not in [snapshot["url"] for snapshot in cdxResult_list_filtered]:
101+
cdxResult_list_filtered.append(snapshot)
102+
cdxResult_list = cdxResult_list_filtered
103+
print(f"\n-----> {len(cdxResult_list)} snapshots found")
104+
return cdxResult_list
105+
except requests.exceptions.ConnectionError as e:
106+
print(f"\n-----> ERROR: could not query snapshots:\n{e}"); exit()
107+
104108

105109

106110

@@ -296,11 +300,18 @@ def download_url_entry(url, filename, filepath, connection, status_message):
296300
f" -> URL: {url}\n" + \
297301
f" -> FILE: {output}"
298302
return True
299-
except http.client.HTTPException as e:
303+
except ConnectionRefusedError as e:
304+
status_message = f"{status_message}\n" + \
305+
f"REFUSED -> ({i+1}/{max_retries}), reconnect in {sleep_time} seconds...\n" + \
306+
f" -> {e}"
300307
print(status_message)
301-
print(f"REFUSED -> ({i+1}/{max_retries}), reconnect in {sleep_time} seconds...")
302-
print(f" -> {e}")
303308
time.sleep(sleep_time)
309+
except http.client.HTTPException as e:
310+
status_message = f"{status_message}\n" + \
311+
f"EXCEPTION -> ({i+1}/{max_retries}), append to failed_urls: {url}\n" + \
312+
f" -> {e}"
313+
print(status_message)
314+
return False
304315
print(f"FAILED -> download, append to failed_urls: {url}")
305316
return False
306317

0 commit comments

Comments
 (0)