3
3
4
4
class SnapshotCollection :
5
5
6
- CDX_RESULT_JSON = []
7
- CDX_RESULT_LIST = []
8
- CDX_RESULT_COLLECTION = []
6
# Raw CDX rows; create_full() stores cdxResult.json()[1:] here.
CDX_JSON = []
# One dict per snapshot ("id", "timestamp", "url"), kept sorted newest-first
# by create_full() and reduced to one entry per URL by create_current().
CDX_LIST = []

# Download entries appended by snapshot_collection_write() and patched in
# place by snapshot_collection_update().
# NOTE(review): these are class-level lists, so they are shared by every
# instance until an instance rebinds the attribute.
SNAPSHOT_COLLECTION = []

# 0 by default; create_current() sets it to 1, and create_entry() then omits
# the timestamp directory from the download path.
MODE_CURRENT = 0
11
12
12
- def __init__ (self , cdxResult = None , cdxCollection = None ):
13
- if cdxResult :
14
- self . CDX_RESULT_JSON = cdxResult . json ()[ 1 :]
15
- self . CDX_RESULT_LIST = [{ "timestamp" : snapshot [ 0 ], "url" : snapshot [ 1 ]} for snapshot in self . CDX_RESULT_JSON ]
16
- self .CDX_RESULT_LIST = sorted ( self . CDX_RESULT_LIST , key = lambda k : k [ 'timestamp' ], reverse = True )
17
- if cdxCollection :
18
- self .CDX_RESULT_COLLECTION = cdxCollection
13
def __init__(self):
    """Create an empty collection that owns its own containers.

    The class body declares ``CDX_JSON``, ``CDX_LIST`` and
    ``SNAPSHOT_COLLECTION`` as class-level lists. Without rebinding them
    here, ``snapshot_collection_write`` appends to the single list shared
    by every instance, so entries written through one collection show up
    in all others.
    """
    self.CDX_JSON = []
    self.CDX_LIST = []
    self.SNAPSHOT_COLLECTION = []
15
+
16
def create_full(self, cdxResult):
    """Load every snapshot from a CDX response into the collection.

    Args:
        cdxResult: Object whose ``json()`` returns a list of rows. The
            first row is discarded (presumably the CDX header row —
            confirm against the caller). Each remaining row is read as
            ``row[0]`` = timestamp and ``row[1]`` = URL.

    Side effects:
        Sets ``self.CDX_JSON`` to the rows after the first one and
        ``self.CDX_LIST`` to one dict per row, sorted by timestamp in
        descending order. ``id`` is the row's position in ``CDX_JSON``,
        assigned before sorting.
    """
    rows = cdxResult.json()[1:]
    self.CDX_JSON = rows

    entries = []
    for position, row in enumerate(rows):
        entries.append({"id": position, "timestamp": row[0], "url": row[1]})

    # Stable in-place sort, newest timestamp first.
    entries.sort(key=lambda entry: entry["timestamp"], reverse=True)
    self.CDX_LIST = entries
19
20
20
21
def create_current(self):
    """Reduce ``CDX_LIST`` to the newest snapshot of each URL.

    Sets ``MODE_CURRENT`` to 1, sorts the entries by timestamp in
    descending order and keeps only the first entry seen for every
    distinct ``"url"`` value. The result replaces ``self.CDX_LIST`` and
    stays in newest-first order.
    """
    self.MODE_CURRENT = 1
    newest_first = sorted(
        self.CDX_LIST, key=lambda entry: entry["timestamp"], reverse=True
    )

    # dicts preserve insertion order and setdefault never overwrites, so
    # each URL maps to the first (newest) entry encountered.
    latest_by_url = {}
    for entry in newest_first:
        latest_by_url.setdefault(entry["url"], entry)

    self.CDX_LIST = list(latest_by_url.values())
28
31
29
- def create_collection (self , output ) :
30
- for snapshot in self . CDX_RESULT_LIST :
31
- timestamp , url = snapshot [ "timestamp" ], snapshot [ " url" ]
32
- url_type = self . __get_url_filetype ( url )
33
- download_url = f"http://web.archive.org/web/ { timestamp } { url_type } / { url } "
34
- domain , subdir , filename = self . __split_url ( url )
35
- if self . MODE_CURRENT : download_dir = os .path .join (output , domain , subdir )
36
- else : download_dir = os .path .join (output , domain , timestamp , subdir )
37
- download_file = os . path . join ( download_dir , filename )
38
- self . CDX_RESULT_COLLECTION . append (
39
- {
40
- "index " : self . CDX_RESULT_LIST . index ( snapshot ) ,
41
- "url " : download_url ,
42
- "file " : str ( download_file ) ,
43
- "success" : False ,
44
- "retry" : 0
45
- }
46
- )
32
def create_entry(self, cdx_entry: dict, output: str) -> dict:
    """Build the download record for a single snapshot.

    Args:
        cdx_entry: Snapshot dict with the keys ``"id"``, ``"timestamp"``
            and ``"url"`` (the shape produced by ``create_full``).
        output: Base directory under which the file path is built.

    Returns:
        A new dict with ``"id"``, ``"url"`` (the web.archive.org download
        URL), ``"file"`` (target path), ``"timestamp"``, ``"origin_url"``,
        ``"success"`` (``False``) and ``"retry"`` (``0``).

    Raises:
        KeyError: If ``cdx_entry`` lacks one of the required keys.
    """
    timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
    url_type = self.__get_url_filetype(url)
    # The URL must not contain whitespace between its components.
    download_url = f"http://web.archive.org/web/{timestamp}{url_type}/{url}"
    domain, subdir, filename = self.__split_url(url)

    # When MODE_CURRENT is set, the timestamp directory is left out of the
    # path; otherwise each snapshot goes into its own timestamp directory.
    if self.MODE_CURRENT:
        download_dir = os.path.join(output, domain, subdir)
    else:
        download_dir = os.path.join(output, domain, timestamp, subdir)

    return {
        "id": cdx_entry["id"],
        "url": download_url,
        "file": os.path.join(download_dir, filename),
        "timestamp": timestamp,
        "origin_url": url,
        "success": False,
        "retry": 0,
    }
47
50
48
51
def count_list(self):
    """Return how many snapshot entries ``CDX_LIST`` currently holds."""
    entries = self.CDX_LIST
    return len(entries)
50
53
51
- def count_collection (self ):
52
- return len (self .CDX_RESULT_COLLECTION )
54
def snapshot_collection_write(self, query_entry: dict):
    """Append ``query_entry`` unless an entry with the same id exists.

    Args:
        query_entry: Entry dict; must contain an ``"id"`` key.

    Raises:
        KeyError: If ``query_entry`` has no ``"id"`` key.
    """
    entry_id = query_entry["id"]
    # Compare against the ids of the stored dicts. Testing the bare id for
    # membership in a list of dicts can never match, so it would let every
    # duplicate through.
    if any(existing["id"] == entry_id for existing in self.SNAPSHOT_COLLECTION):
        return
    self.SNAPSHOT_COLLECTION.append(query_entry)
53
57
54
- def set_value (self , index : int , key : str , value : str ):
55
- """
56
- Set a value in the collection
58
def snapshot_collection_update(self, id: int, key: str, value: str):
    """Set ``key`` to ``value`` on the first stored entry whose id matches.

    Args:
        id: Value compared against each entry's ``"id"``.
        key: Dictionary key to write on the matching entry.
        value: Value stored under ``key``.

    Does nothing when no entry in ``SNAPSHOT_COLLECTION`` has that id.
    """
    for entry in self.SNAPSHOT_COLLECTION:
        if entry["id"] == id:
            entry[key] = value
            # Only the first match is updated.
            return
57
62
58
- Args:
59
- index (int): Index of the snapshot
60
- key (str): Key of the value
61
- value (str): Value to set
62
- """
63
- self .CDX_RESULT_COLLECTION [index ][key ] = value
64
-
65
63
def __get_url_filetype (self , url ):
66
- file_extension = url . split ( "." )[ - 1 ]
64
+ file_extension = os . path . splitext ( url )[ 1 ][ 1 : ]
67
65
urltype_mapping = {
68
66
"jpg" : "im_" ,
69
67
"jpeg" : "im_" ,
@@ -80,6 +78,6 @@ def __get_url_filetype(self, url):
80
78
def __split_url(self, url):
    """Split ``url`` into ``(domain, subdir, filename)``.

    Args:
        url: URL string to take apart.

    Returns:
        tuple: ``domain`` is the network location. ``subdir`` is the
        directory part of the path without leading or trailing slashes
        (``""`` for files at the site root). ``filename`` is the last
        path segment, or ``"index.html"`` when the path is empty or ends
        with a slash.
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    # Split at the LAST slash of the untouched path. This keeps every
    # directory for paths such as "/a/b/", and gives an empty subdir
    # (not the filename) for root-level files such as "/file.html".
    directory, _, filename = parsed_url.path.rpartition("/")
    subdir = directory.strip("/")
    return domain, subdir, filename or "index.html"
0 commit comments