@@ -146,9 +146,6 @@ def insert_fileset(
         cursor.execute(
             f"UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = {existing_entry}"
         )
-        cursor.execute(
-            f"UPDATE fileset SET status = 'detection' WHERE id = {existing_entry} AND status = 'obsolete'"
-        )
         cursor.execute(f"SELECT status FROM fileset WHERE id = {existing_entry}")
         status = cursor.fetchone()["status"]
         if status == "user":
@@ -610,10 +607,6 @@ def db_insert(data_arr, username=None, skiplog=False):

         fileset_count += 1

-    if detection:
-        conn.cursor().execute(
-            "UPDATE fileset SET status = 'obsolete' WHERE `timestamp` != FROM_UNIXTIME(@fileset_time_last) AND status = 'detection'"
-        )
     cur = conn.cursor()

     try:
@@ -2130,22 +2123,39 @@ def set_glk_filter_candidate_filesets(
         for row in rows:
             candidates.append(row["fileset_id"])

+        matched_candidates = []
+
+        candidates = [
+            candidate
+            for candidate in candidates
+            if is_candidate_by_checksize(candidate, fileset, conn)
+        ]
+
+        for candidate in candidates:
+            if is_full_detection_checksum_match(candidate, fileset, conn):
+                matched_candidates.append(candidate)
+
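+        # Prefer candidates whose detection checksums fully match, when any exist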
+        if len(matched_candidates) != 0:
+            candidates = matched_candidates
+
     return (candidates, fileset_count)


 def set_filter_candidate_filesets(
     fileset_id, fileset, fileset_count, transaction_id, conn
 ):
     """
-    Returns a list of candidate filesets that can be merged
+    Returns a list of candidate filesets that can be merged.
+    Performs early filtering in SQL (by engine, name, size) and then
+    applies checksum filtering and max-match filtering in Python.
     """
     with conn.cursor() as cursor:
-        # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches
         fileset_count += 1
         console_log_candidate_filtering(fileset_count)
+
+        # Early filter candidates using engine name, file name and size
         query = """
-            WITH candidate_fileset AS (
-            SELECT fs.id AS fileset_id, f.name, f.size
+            SELECT fs.id AS fileset_id, f.id AS file_id, f.name, f.size
             FROM file f
             JOIN fileset fs ON f.fileset = fs.id
             JOIN game g ON g.id = fs.game
@@ -2154,65 +2164,94 @@ def set_filter_candidate_filesets(
             WHERE e.engineid = %s
             AND f.detection = 1
             AND t.transaction != %s
-            ),
-            total_detection_files AS (
-            SELECT cf.fileset_id, COUNT(*) AS detection_files_found
-            FROM candidate_fileset cf
-            GROUP BY fileset_id
-            ),
-            set_fileset AS (
-            SELECT name, size FROM file
-            WHERE fileset = %s
-            ),
-            matched_detection_files AS (
-            SELECT cf.fileset_id, COUNT(*) AS match_files_count
-            FROM candidate_fileset cf
-            JOIN set_fileset sf ON ( (
-                cf.name = sf.name
-                OR
-                REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '')
-            ) AND (cf.size = sf.size OR cf.size = -1) )
-            GROUP BY cf.fileset_id
-            ),
-            valid_matched_detection_files AS (
-            SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count
-            FROM matched_detection_files mdf
-            JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id
-            WHERE tdf.detection_files_found <= mdf.match_files_count
-            ),
-            max_match_count AS (
-            SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files
-            )
-            SELECT vmdf.fileset_id
-            FROM valid_matched_detection_files vmdf
-            JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id
-            JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count
         """
-
-        cursor.execute(query, (fileset["sourcefile"], transaction_id, fileset_id))
-        rows = cursor.fetchall()
-
-        candidates = []
-        if rows:
-            for row in rows:
-                candidates.append(row["fileset_id"])
-
-        matched_candidates = []
-
-        candidates = [
-            candidate
-            for candidate in candidates
-            if is_candidate_by_checksize(candidate, fileset, conn)
-        ]
-
-        for candidate in candidates:
-            if is_full_detection_checksum_match(candidate, fileset, conn):
-                matched_candidates.append(candidate)
-
-        if len(matched_candidates) != 0:
-            candidates = matched_candidates
-
-        return (candidates, fileset_count)
+        cursor.execute(query, (fileset["sourcefile"], transaction_id))
+        raw_candidates = cursor.fetchall()
+
+        # fileset id to detection files map
+        candidate_map = defaultdict(list)
+        total_detection_files_map = defaultdict(int)
+        for row in raw_candidates:
+            candidate_map[row["fileset_id"]].append(
+                {
+                    "file_id": row["file_id"],
+                    "name": row["name"],
+                    "size": row["size"],
+                }
+            )
+        for id, files in candidate_map.items():
+            total_detection_files_map[id] = len(files)
+
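+        # Build lookup sets from the incoming fileset; each entry is added both
+        # with its real size and with -1, since detection entries may not
+        # specify a size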
+        set_checksums = set()
+        set_file_name_size = set()
+        for file in fileset["rom"]:
+            for key in file:
+                if key.startswith("md5"):
+                    name = os.path.basename(normalised_path(file["name"]))
+                    set_checksums.add((file[key], name.lower(), int(file["size"])))
+                    set_checksums.add((file[key], name.lower(), -1))
+                    set_file_name_size.add((name.lower(), -1))
+                    set_file_name_size.add((name.lower(), int(file["size"])))
+
+        # Filter candidates by detection filename and file size (including -1)
+        # and count a file as matched when:
+        #   - filesize is -1, or
+        #   - filesize <= checksize and the checksum matches, or
+        #   - filesize > checksize (no stored checksum covers the whole file).
+        match_counts = {}
+        for fileset_id, files in candidate_map.items():
+            count = 0
+            with conn.cursor() as cursor:
+                for f in files:
+                    filename = os.path.basename(f["name"]).lower()
+                    filesize = f["size"]
+                    if (filename, filesize) in set_file_name_size:
+                        if filesize == -1:
+                            count += 1
+                        else:
+                            cursor.execute(
+                                """
+                                SELECT checksum, checksize, checktype
+                                FROM filechecksum
+                                WHERE file = %s
+                                """,
+                                (f["file_id"],),
+                            )
+                            checksums = cursor.fetchall()
+                            not_inc_count = False
+                            for c in checksums:
+                                checksum = c["checksum"]
+                                checksize = c["checksize"]
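+                                # checksize '1M' denotes a checksum over the
+                                # first 1 MiB; '0' one over the whole file
+                                # (assumed semantics of the filechecksum table)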
+                                if checksize == "1M":
+                                    checksize = 1048576
+                                elif checksize == "0":
+                                    checksize = filesize
+                                if filesize <= int(checksize):
+                                    if (checksum, filename, filesize) in set_checksums:
+                                        count += 1
+                                    not_inc_count = True
+                                    # if it was a true match, checksum should be present
+                                    break
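+                            # no stored checksum covers the full file
+                            # (filesize exceeds every checksize), so the
+                            # name+size match counts on its own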
+                            if not not_inc_count:
+                                count += 1
+            if count > 0 and total_detection_files_map[fileset_id] <= count:
+                match_counts[fileset_id] = count
+
+        # Filter only entries with maximum number of matched files
+        if not match_counts:
+            return ([], fileset_count)
+
+        max_match = max(match_counts.values())
+        candidates = [fid for fid, count in match_counts.items() if count == max_match]
+
+        matched_candidates = []
+        for candidate in candidates:
+            if is_full_detection_checksum_match(candidate, fileset, conn):
+                matched_candidates.append(candidate)
+
+        if len(matched_candidates) != 0:
+            candidates = matched_candidates
+
+        return (candidates, fileset_count)


 def is_candidate_by_checksize(candidate, fileset, conn):
@@ -2826,6 +2865,87 @@ def finalize_fileset_insertion(
     )


+# def user_integrity_check(data, ip, game_metadata=None):
+#     src = "user"
+#     source_status = src
+#     new_files = []
+
+#     for file in data["files"]:
+#         new_file = {
+#             "name": file["name"],
+#             "size": file["size"],
+#             "size-r": file["size-r"] if "size-r" in file else 0,
+#             "size-rd": file["size-rd"] if "size-rd" in file else 0,
+#         }
+#         for checksum in file["checksums"]:
+#             checksum_type = checksum["type"]
+#             checksum_value = checksum["checksum"]
+#             new_file[checksum_type] = checksum_value
+
+#         new_files.append(new_file)
+
+#     data["rom"] = new_files
+#     key = calc_key(data)
+#     try:
+#         conn = db_connect()
+#     except Exception as e:
+#         print(f"Failed to connect to database: {e}")
+#         return
+
+#     conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}")
+
+
+#     try:
+#         with conn.cursor() as cursor:
+#             cursor.execute("SELECT MAX(`transaction`) FROM transactions")
+#             transaction_id = cursor.fetchone()["MAX(`transaction`)"] + 1
+
+#             category_text = f"Uploaded from {src}"
+#             log_text = f"Started loading file, State {source_status}. Transaction: {transaction_id}"
+
+#             user = f"cli:{getpass.getuser()}"
+
+#             create_log(
+#                 escape_string(category_text), user, escape_string(log_text), conn
+#             )
+
+#             extra_map = defaultdict(list)
+#             missing_map = defaultdict(list)
+#             extra_set = set()
+#             missing_set = set()
+
+#             filesets_check_for_full = set()
+#             for rom in data["rom"]:
+#                 pre_update_files(rom, filesets_check_for_full, transaction_id, conn)
+
+#             update_status_for_partial_filesets(list(filesets_check_for_full), conn)
+
+#             candidate_filesets = scan_filter_candidate_filesets()
+
+#             if len(candidate_filesets) == 0:
+#                 insert_new_fileset(
+#                     data, conn, None, src, key, None, transaction_id, log_text, user, ip
+#                 )
+#             # return matched_map, missing_map, extra_map
+
+
+#     except Exception as e:
+#         conn.rollback()
+#         print(f"Error processing user data: {e}")
+#     finally:
+#         category_text = f"Uploaded from {src}"
+#         log_text = f"Completed loading file, State {source_status}. Transaction: {transaction_id}"
+#         create_log(escape_string(category_text), user, escape_string(log_text), conn)
+#         conn.close()
+#     # return matched_map, missing_map, extra_map


+def update_status_for_partial_filesets(fileset_list, conn):
+    """
+    Updates the status of the given filesets from partial to full, if all of their files have full checksums.
+    """
+
+
 def user_integrity_check(data, ip, game_metadata=None):
     src = "user"
     source_status = src
@@ -2835,8 +2955,8 @@ def user_integrity_check(data, ip, game_metadata=None):
         new_file = {
             "name": file["name"],
             "size": file["size"],
-            "size-r": file["size-r"],
-            "size-rd": file["size-rd"],
+            "size-r": file["size-r"] if "size-r" in file else 0,
+            "size-rd": file["size-rd"] if "size-rd" in file else 0,
         }
         for checksum in file["checksums"]:
             checksum_type = checksum["type"]