@@ -2130,15 +2130,17 @@ def set_filter_candidate_filesets(
2130
2130
fileset_id , fileset , fileset_count , transaction_id , conn
2131
2131
):
2132
2132
"""
2133
- Returns a list of candidate filesets that can be merged
2133
+ Returns a list of candidate filesets that can be merged.
2134
+ Performs early filtering in SQL (by engine, detection flag and transaction) and then
2135
+ applies filename/size, checksum and max-match filtering in Python.
2134
2136
"""
2135
2137
with conn .cursor () as cursor :
2136
- # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches
2137
2138
fileset_count += 1
2138
2139
console_log_candidate_filtering (fileset_count )
2140
+
2141
+ # Early filter candidates in SQL by engine, detection flag and transaction; filename and size are matched in Python below
2139
2142
query = """
2140
- WITH candidate_fileset AS (
2141
- SELECT fs.id AS fileset_id, f.name, f.size
2143
+ SELECT fs.id AS fileset_id, f.id AS file_id, f.name, f.size
2142
2144
FROM file f
2143
2145
JOIN fileset fs ON f.fileset = fs.id
2144
2146
JOIN game g ON g.id = fs.game
@@ -2147,65 +2149,94 @@ def set_filter_candidate_filesets(
2147
2149
WHERE e.engineid = %s
2148
2150
AND f.detection = 1
2149
2151
AND t.transaction != %s
2150
- ),
2151
- total_detection_files AS (
2152
- SELECT cf.fileset_id, COUNT(*) AS detection_files_found
2153
- FROM candidate_fileset cf
2154
- GROUP BY fileset_id
2155
- ),
2156
- set_fileset AS (
2157
- SELECT name, size FROM file
2158
- WHERE fileset = %s
2159
- ),
2160
- matched_detection_files AS (
2161
- SELECT cf.fileset_id, COUNT(*) AS match_files_count
2162
- FROM candidate_fileset cf
2163
- JOIN set_fileset sf ON ( (
2164
- cf.name = sf.name
2165
- OR
2166
- REGEXP_REPLACE(cf.name, '^.*[\\ \\ /]', '') = REGEXP_REPLACE(sf.name, '^.*[\\ \\ /]', '')
2167
- ) AND (cf.size = sf.size OR cf.size = -1) )
2168
- GROUP BY cf.fileset_id
2169
- ),
2170
- valid_matched_detection_files AS (
2171
- SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count
2172
- FROM matched_detection_files mdf
2173
- JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id
2174
- WHERE tdf.detection_files_found <= mdf.match_files_count
2175
- ),
2176
- max_match_count AS (
2177
- SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files
2178
- )
2179
- SELECT vmdf.fileset_id
2180
- FROM valid_matched_detection_files vmdf
2181
- JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id
2182
- JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count
2183
2152
"""
2184
-
2185
- cursor .execute (query , (fileset ["sourcefile" ], transaction_id , fileset_id ))
2186
- rows = cursor .fetchall ()
2187
-
2188
- candidates = []
2189
- if rows :
2190
- for row in rows :
2191
- candidates .append (row ["fileset_id" ])
2192
-
2193
- matched_candidates = []
2194
-
2195
- candidates = [
2196
- candidate
2197
- for candidate in candidates
2198
- if is_candidate_by_checksize (candidate , fileset , conn )
2199
- ]
2200
-
2201
- for candidate in candidates :
2202
- if is_full_detection_checksum_match (candidate , fileset , conn ):
2203
- matched_candidates .append (candidate )
2204
-
2205
- if len (matched_candidates ) != 0 :
2206
- candidates = matched_candidates
2207
-
2208
- return (candidates , fileset_count )
2153
+ cursor .execute (query , (fileset ["sourcefile" ], transaction_id ))
2154
+ raw_candidates = cursor .fetchall ()
2155
+
2156
+ # fileset id to detection files map
2157
+ candidate_map = defaultdict (list )
2158
+ total_detection_files_map = defaultdict (int )
2159
+ for row in raw_candidates :
2160
+ candidate_map [row ["fileset_id" ]].append (
2161
+ {
2162
+ "file_id" : row ["file_id" ],
2163
+ "name" : row ["name" ],
2164
+ "size" : row ["size" ],
2165
+ }
2166
+ )
2167
+ for id , files in candidate_map .items ():
2168
+ total_detection_files_map [id ] = len (files )
2169
+
2170
+ set_checksums = set ()
2171
+ set_file_name_size = set ()
2172
+ for file in fileset ["rom" ]:
2173
+ for key in file :
2174
+ if key .startswith ("md5" ):
2175
+ name = os .path .basename (normalised_path (file ["name" ]))
2176
+ set_checksums .add ((file [key ], name .lower (), int (file ["size" ])))
2177
+ set_checksums .add ((file [key ], name .lower (), - 1 ))
2178
+ set_file_name_size .add ((name .lower (), - 1 ))
2179
+ set_file_name_size .add ((name .lower (), int (file ["size" ])))
2180
+
2181
+ # Filter candidates by detection filename and file size (including -1); a detection file increases the matched file count
2182
+ # if filesize = -1,
2183
+ # elif filesize <= checksize and checksum matches,
2184
+ # elif filesize > checksize.
2185
+ match_counts = {}
2186
+ for fileset_id , files in candidate_map .items ():
2187
+ count = 0
2188
+ with conn .cursor () as cursor :
2189
+ for f in files :
2190
+ filename = os .path .basename (f ["name" ]).lower ()
2191
+ filesize = f ["size" ]
2192
+ if (filename , filesize ) in set_file_name_size :
2193
+ if filesize == - 1 :
2194
+ count += 1
2195
+ else :
2196
+ cursor .execute (
2197
+ """
2198
+ SELECT checksum, checksize, checktype
2199
+ FROM filechecksum
2200
+ WHERE file = %s
2201
+ """ ,
2202
+ (f ["file_id" ],),
2203
+ )
2204
+ checksums = cursor .fetchall ()
2205
+ not_inc_count = False
2206
+ for c in checksums :
2207
+ checksum = c ["checksum" ]
2208
+ checksize = c ["checksize" ]
2209
+ if checksize == "1M" :
2210
+ checksize = 1048576
2211
+ elif checksize == "0" :
2212
+ checksize = filesize
2213
+ if filesize <= int (checksize ):
2214
+ if (checksum , filename , filesize ) in set_checksums :
2215
+ count += 1
2216
+ not_inc_count = True
2217
+ # if it was a true match, checksum should be present
2218
+ break
2219
+ if not not_inc_count :
2220
+ count += 1
2221
+ if count > 0 and total_detection_files_map [fileset_id ] <= count :
2222
+ match_counts [fileset_id ] = count
2223
+
2224
+ # Filter only entries with maximum number of matched files
2225
+ if not match_counts :
2226
+ return ([], fileset_count )
2227
+
2228
+ max_match = max (match_counts .values ())
2229
+ candidates = [fid for fid , count in match_counts .items () if count == max_match ]
2230
+
2231
+ matched_candidates = []
2232
+ for candidate in candidates :
2233
+ if is_full_detection_checksum_match (candidate , fileset , conn ):
2234
+ matched_candidates .append (candidate )
2235
+
2236
+ if len (matched_candidates ) != 0 :
2237
+ candidates = matched_candidates
2238
+
2239
+ return (candidates , fileset_count )
2209
2240
2210
2241
2211
2242
def is_candidate_by_checksize (candidate , fileset , conn ):
0 commit comments