Skip to content

Commit a303645

Browse files
committed
INTEGRITY: Add checksum-based filtering before filtering by maximum number of files matched.
1 parent ca9d4a7 commit a303645

File tree

1 file changed

+93
-62
lines changed

1 file changed

+93
-62
lines changed

db_functions.py

Lines changed: 93 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -2130,15 +2130,17 @@ def set_filter_candidate_filesets(
21302130
fileset_id, fileset, fileset_count, transaction_id, conn
21312131
):
21322132
"""
2133-
Returns a list of candidate filesets that can be merged
2133+
Returns a list of candidate filesets that can be merged.
2134+
Performs early filtering in SQL (by engine, detection flag and transaction) and then
2135+
applies filename/size, checksum and max-match filtering in Python.
21342136
"""
21352137
with conn.cursor() as cursor:
2136-
# Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches
21372138
fileset_count += 1
21382139
console_log_candidate_filtering(fileset_count)
2140+
2141+
# Early filter candidates in SQL by engine id, detection flag and transaction; filename and size are matched in Python below
21392142
query = """
2140-
WITH candidate_fileset AS (
2141-
SELECT fs.id AS fileset_id, f.name, f.size
2143+
SELECT fs.id AS fileset_id, f.id AS file_id, f.name, f.size
21422144
FROM file f
21432145
JOIN fileset fs ON f.fileset = fs.id
21442146
JOIN game g ON g.id = fs.game
@@ -2147,65 +2149,94 @@ def set_filter_candidate_filesets(
21472149
WHERE e.engineid = %s
21482150
AND f.detection = 1
21492151
AND t.transaction != %s
2150-
),
2151-
total_detection_files AS (
2152-
SELECT cf.fileset_id, COUNT(*) AS detection_files_found
2153-
FROM candidate_fileset cf
2154-
GROUP BY fileset_id
2155-
),
2156-
set_fileset AS (
2157-
SELECT name, size FROM file
2158-
WHERE fileset = %s
2159-
),
2160-
matched_detection_files AS (
2161-
SELECT cf.fileset_id, COUNT(*) AS match_files_count
2162-
FROM candidate_fileset cf
2163-
JOIN set_fileset sf ON ( (
2164-
cf.name = sf.name
2165-
OR
2166-
REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '')
2167-
) AND (cf.size = sf.size OR cf.size = -1) )
2168-
GROUP BY cf.fileset_id
2169-
),
2170-
valid_matched_detection_files AS (
2171-
SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count
2172-
FROM matched_detection_files mdf
2173-
JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id
2174-
WHERE tdf.detection_files_found <= mdf.match_files_count
2175-
),
2176-
max_match_count AS (
2177-
SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files
2178-
)
2179-
SELECT vmdf.fileset_id
2180-
FROM valid_matched_detection_files vmdf
2181-
JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id
2182-
JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count
21832152
"""
2184-
2185-
cursor.execute(query, (fileset["sourcefile"], transaction_id, fileset_id))
2186-
rows = cursor.fetchall()
2187-
2188-
candidates = []
2189-
if rows:
2190-
for row in rows:
2191-
candidates.append(row["fileset_id"])
2192-
2193-
matched_candidates = []
2194-
2195-
candidates = [
2196-
candidate
2197-
for candidate in candidates
2198-
if is_candidate_by_checksize(candidate, fileset, conn)
2199-
]
2200-
2201-
for candidate in candidates:
2202-
if is_full_detection_checksum_match(candidate, fileset, conn):
2203-
matched_candidates.append(candidate)
2204-
2205-
if len(matched_candidates) != 0:
2206-
candidates = matched_candidates
2207-
2208-
return (candidates, fileset_count)
2153+
cursor.execute(query, (fileset["sourcefile"], transaction_id))
2154+
raw_candidates = cursor.fetchall()
2155+
2156+
# fileset id to detection files map
2157+
candidate_map = defaultdict(list)
2158+
total_detection_files_map = defaultdict(int)
2159+
for row in raw_candidates:
2160+
candidate_map[row["fileset_id"]].append(
2161+
{
2162+
"file_id": row["file_id"],
2163+
"name": row["name"],
2164+
"size": row["size"],
2165+
}
2166+
)
2167+
for id, files in candidate_map.items():
2168+
total_detection_files_map[id] = len(files)
2169+
2170+
set_checksums = set()
2171+
set_file_name_size = set()
2172+
for file in fileset["rom"]:
2173+
for key in file:
2174+
if key.startswith("md5"):
2175+
name = os.path.basename(normalised_path(file["name"]))
2176+
set_checksums.add((file[key], name.lower(), int(file["size"])))
2177+
set_checksums.add((file[key], name.lower(), -1))
2178+
set_file_name_size.add((name.lower(), -1))
2179+
set_file_name_size.add((name.lower(), int(file["size"])))
2180+
2181+
# Filter candidates by detection filename and file size (including -1) and increase matched file count
2182+
# if filesize = -1,
2183+
# elif filesize <= checksize and checksum matches,
2184+
# elif filesize > checksize.
2185+
match_counts = {}
2186+
for fileset_id, files in candidate_map.items():
2187+
count = 0
2188+
with conn.cursor() as cursor:
2189+
for f in files:
2190+
filename = os.path.basename(f["name"]).lower()
2191+
filesize = f["size"]
2192+
if (filename, filesize) in set_file_name_size:
2193+
if filesize == -1:
2194+
count += 1
2195+
else:
2196+
cursor.execute(
2197+
"""
2198+
SELECT checksum, checksize, checktype
2199+
FROM filechecksum
2200+
WHERE file = %s
2201+
""",
2202+
(f["file_id"],),
2203+
)
2204+
checksums = cursor.fetchall()
2205+
not_inc_count = False
2206+
for c in checksums:
2207+
checksum = c["checksum"]
2208+
checksize = c["checksize"]
2209+
if checksize == "1M":
2210+
checksize = 1048576
2211+
elif checksize == "0":
2212+
checksize = filesize
2213+
if filesize <= int(checksize):
2214+
if (checksum, filename, filesize) in set_checksums:
2215+
count += 1
2216+
not_inc_count = True
2217+
# if it was a true match, checksum should be present
2218+
break
2219+
if not not_inc_count:
2220+
count += 1
2221+
if count > 0 and total_detection_files_map[fileset_id] <= count:
2222+
match_counts[fileset_id] = count
2223+
2224+
# Filter only entries with maximum number of matched files
2225+
if not match_counts:
2226+
return ([], fileset_count)
2227+
2228+
max_match = max(match_counts.values())
2229+
candidates = [fid for fid, count in match_counts.items() if count == max_match]
2230+
2231+
matched_candidates = []
2232+
for candidate in candidates:
2233+
if is_full_detection_checksum_match(candidate, fileset, conn):
2234+
matched_candidates.append(candidate)
2235+
2236+
if len(matched_candidates) != 0:
2237+
candidates = matched_candidates
2238+
2239+
return (candidates, fileset_count)
22092240

22102241

22112242
def is_candidate_by_checksize(candidate, fileset, conn):

0 commit comments

Comments
 (0)