@@ -81,26 +81,30 @@ def print_result(result_list):
81
81
82
82
# create filelist
83
83
def query_list(url: str, range: int, mode: str):
    """Query the Wayback Machine CDX endpoint for snapshots of ``url``.

    Args:
        url: Domain/path to look up; it is embedded in the query as
            ``*.{url}/*``.
        range: Number of years to look back from the current year. A falsy
            value (``0``/``None``) sends no ``from`` restriction. The name
            shadows the builtin but is kept for callers that pass it by
            keyword.
        mode: When equal to ``"current"``, only the newest snapshot of each
            distinct URL is kept; any other value keeps every snapshot.

    Returns:
        A list of ``{"timestamp": ..., "url": ...}`` dicts, one per snapshot.
        In ``"current"`` mode the list is ordered newest-first.

    Raises:
        SystemExit: If the request fails or the server does not answer with
            HTTP 200 (an error message is printed first).
    """
    # sys.exit() instead of the `site` helper exit(), which is meant for the
    # interactive shell and is not guaranteed to exist in every interpreter.
    import sys

    print("\n Querying snapshots...")

    # Keep the int parameter untouched and build the optional filter in its
    # own variable instead of rebinding `range` from int to str.
    from_filter = f"&from={datetime.now().year - range}" if range else ""

    # The URL must not contain whitespace, so the pieces are joined directly.
    cdx_query = (
        "https://web.archive.org/cdx/search/xd?output=json"
        f"&url=*.{url}/*{from_filter}"
        "&fl=timestamp,original&filter=!statuscode:200"
    )

    # Only the network call can raise a requests exception, so the try body
    # is limited to it. RequestException is the base class of ConnectionError
    # and of the other transport failures, which are reported the same way.
    try:
        response = requests.get(cdx_query)
    except requests.exceptions.RequestException as e:
        print(f"\n -----> ERROR: could not query snapshots:\n {e} ")
        sys.exit()

    if response.status_code != 200:
        print(f"\n -----> ERROR: could not query snapshots, status code: {response.status_code} ")
        sys.exit()

    # The first row of the JSON answer is the field list [timestamp, original],
    # so it is dropped.
    rows = response.json()[1:]
    snapshots = [{"timestamp": row[0], "url": row[1]} for row in rows]

    if mode == "current":
        # Newest first, then keep the first occurrence of each URL. A set
        # gives O(1) membership tests instead of rebuilding a URL list for
        # every snapshot.
        snapshots.sort(key=lambda entry: entry["timestamp"], reverse=True)
        seen_urls = set()
        newest_only = []
        for entry in snapshots:
            if entry["url"] not in seen_urls:
                seen_urls.add(entry["url"])
                newest_only.append(entry)
        snapshots = newest_only

    print(f"\n -----> {len(snapshots)} snapshots found")
    return snapshots
107
+
104
108
105
109
106
110
@@ -296,11 +300,18 @@ def download_url_entry(url, filename, filepath, connection, status_message):
296
300
f" -> URL: { url } \n " + \
297
301
f" -> FILE: { output } "
298
302
return True
299
- except http .client .HTTPException as e :
303
+ except ConnectionRefusedError as e :
304
+ status_message = f"{ status_message } \n " + \
305
+ f"REFUSED -> ({ i + 1 } /{ max_retries } ), reconnect in { sleep_time } seconds...\n " + \
306
+ f" -> { e } "
300
307
print (status_message )
301
- print (f"REFUSED -> ({ i + 1 } /{ max_retries } ), reconnect in { sleep_time } seconds..." )
302
- print (f" -> { e } " )
303
308
time .sleep (sleep_time )
309
+ except http .client .HTTPException as e :
310
+ status_message = f"{ status_message } \n " + \
311
+ f"EXCEPTION -> ({ i + 1 } /{ max_retries } ), append to failed_urls: { url } \n " + \
312
+ f" -> { e } "
313
+ print (status_message )
314
+ return False
304
315
print (f"FAILED -> download, append to failed_urls: { url } " )
305
316
return False
306
317
0 commit comments