Skip to content

Commit c9dca04

Browse files
committed
INTEGRITY: Add checksum filtering before max files filtering in scan.dat processing
1 parent 57df340 commit c9dca04

File tree

1 file changed

+130
-53
lines changed

1 file changed

+130
-53
lines changed

db_functions.py

Lines changed: 130 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -1059,6 +1059,7 @@ def scan_process(
10591059
create_log(escape_string(category_text), user, escape_string(log_text), conn)
10601060
category_text = "Upload information"
10611061
log_text = f"Number of filesets: {fileset_insertion_count}. Filesets automatically merged: {automatic_merged_filesets}. Filesets requiring manual merge (multiple candidates): {manual_merged_filesets}. Filesets requiring manual merge (matched with detection): {manual_merged_with_detection}. Filesets dropped, no candidate: {dropped_early_no_candidate}. Filesets matched with existing Full fileset: {match_with_full_fileset}. Filesets with mismatched files with Full fileset: {mismatch_with_full_fileset}. Filesets missing files compared to partial fileset candidate: {filesets_with_missing_files}."
1062+
console_log(log_text)
10621063
create_log(escape_string(category_text), user, escape_string(log_text), conn)
10631064

10641065

@@ -1115,9 +1116,12 @@ def pre_update_files(rom, filesets_check_for_full, transaction_id, conn):
11151116
SET size = %s,
11161117
`size-r` = %s,
11171118
`size-rd` = %s,
1119+
name = %s
11181120
WHERE id = %s
11191121
"""
1120-
cursor.execute(query, size, size_r, size_rd, file_id)
1122+
cursor.execute(
1123+
query, (size, size_r, size_rd, normalised_path(rom["name"]), file_id)
1124+
)
11211125

11221126

11231127
def scan_perform_match(
@@ -1396,73 +1400,146 @@ def total_fileset_files(fileset):
13961400

13971401
def scan_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn):
13981402
"""
1399-
Returns a list of candidate filesets that can be merged
1403+
Returns a list of candidate filesets that can be merged.
1404+
Fetches the detection files of filesets whose transaction differs from the current one in SQL, then
1405+
applies name/size, checksum, and max-match filtering in Python.
14001406
"""
14011407
with conn.cursor() as cursor:
1402-
# Returns those filesets which have all detection files matching in the scan fileset filtered by file name and file size(if not -1).
1403-
1408+
# Fetching detection filename and all sizes (size, size-r, size-rd) from database
14041409
query = """
1405-
WITH candidate_fileset AS (
1406-
SELECT fs.id AS fileset_id, f.name, f.size,
1410+
SELECT fs.id AS fileset_id, f.id as file_id, f.name, f.size,
14071411
f.`size-r` AS size_r, f.`size-rd` AS size_rd
14081412
FROM file f
14091413
JOIN fileset fs ON f.fileset = fs.id
14101414
JOIN game g ON g.id = fs.game
1415+
JOIN engine e ON e.id = g.engine
14111416
JOIN transactions t ON t.fileset = fs.id
14121417
WHERE f.detection = 1
14131418
AND t.transaction != %s
1414-
),
1415-
total_detection_files AS (
1416-
SELECT cf.fileset_id, COUNT(*) AS detection_files_found
1417-
FROM candidate_fileset cf
1418-
GROUP BY fileset_id
1419-
),
1420-
set_fileset AS (
1421-
SELECT name, size,
1422-
`size-r` AS size_r, `size-rd` AS size_rd
1423-
FROM file
1424-
WHERE fileset = %s
1425-
),
1426-
matched_detection_files AS (
1427-
SELECT cf.fileset_id, COUNT(*) AS match_files_count
1428-
FROM candidate_fileset cf
1429-
JOIN set_fileset sf ON ( (
1430-
cf.name = sf.name
1431-
OR
1432-
REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '')
1433-
) AND (cf.size = sf.size OR cf.size = -1)
1434-
AND (cf.size_r = sf.size_r)
1435-
AND (cf.size_rd = sf.size_rd))
1436-
GROUP BY cf.fileset_id
1437-
),
1438-
valid_matched_detection_files AS (
1439-
SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count
1440-
FROM matched_detection_files mdf
1441-
JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id
1442-
WHERE tdf.detection_files_found <= mdf.match_files_count
1443-
),
1444-
max_match_count AS (
1445-
SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files
1446-
)
1447-
SELECT vmdf.fileset_id
1448-
FROM valid_matched_detection_files vmdf
1449-
JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id
1450-
JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count
14511419
"""
1420+
cursor.execute(query, (transaction_id,))
1421+
raw_candidates = cursor.fetchall()
1422+
1423+
# fileset id to detection files map
1424+
candidate_map = defaultdict(list)
1425+
total_detection_files_map = defaultdict(int)
1426+
for row in raw_candidates:
1427+
candidate_map[row["fileset_id"]].append(
1428+
{
1429+
"file_id": row["file_id"],
1430+
"name": os.path.basename(normalised_path(row["name"])).lower(),
1431+
"size": row["size"],
1432+
"size-r": row["size_r"],
1433+
"size-rd": row["size_rd"],
1434+
}
1435+
)
1436+
for id, files in candidate_map.items():
1437+
total_detection_files_map[id] = len(files)
1438+
1439+
set_checksums = set()
1440+
set_file_name_size = set()
1441+
for file in fileset["rom"]:
1442+
name = os.path.basename(normalised_path(file["name"]))
1443+
for key in file:
1444+
if key.startswith("md5"):
1445+
set_checksums.add(
1446+
(
1447+
file[key],
1448+
name.lower(),
1449+
int(file["size"]),
1450+
int(file["size-r"]),
1451+
int(file["size-rd"]),
1452+
)
1453+
)
1454+
set_checksums.add(
1455+
(
1456+
file[key],
1457+
name.lower(),
1458+
-1,
1459+
int(file["size-r"]),
1460+
int(file["size-rd"]),
1461+
)
1462+
)
1463+
set_file_name_size.add(
1464+
(name.lower(), -1, int(file["size-r"]), int(file["size-rd"]))
1465+
)
1466+
set_file_name_size.add(
1467+
(name.lower(), int(file["size"]), int(file["size-r"]), int(file["size-rd"]))
1468+
)
1469+
1470+
# Filter candidates by detection filename and file size (including -1), counting a detection file as matched
1471+
# if its filesize is -1,
1472+
# or if filesize <= checksize and the checksum matches,
1473+
# or if filesize > checksize.
1474+
match_counts = {}
1475+
for fileset_id, files in candidate_map.items():
1476+
count = 0
1477+
with conn.cursor() as cursor:
1478+
for f in files:
1479+
filename = os.path.basename(f["name"]).lower()
1480+
size = f["size"]
1481+
size_r = f["size-r"]
1482+
size_rd = f["size-rd"]
1483+
if (filename, size, size_r, size_rd) in set_file_name_size:
1484+
if size == -1:
1485+
count += 1
1486+
else:
1487+
cursor.execute(
1488+
"""
1489+
SELECT checksum, checksize, checktype
1490+
FROM filechecksum
1491+
WHERE file = %s
1492+
""",
1493+
(f["file_id"],),
1494+
)
1495+
checksums = cursor.fetchall()
1496+
not_inc_count = False
1497+
for c in checksums:
1498+
filesize = size
1499+
checksum = c["checksum"]
1500+
checksize = c["checksize"]
1501+
checktype = c["checktype"]
1502+
# Macfiles handling
1503+
if checktype in ["md5-r", "md5-rt"]:
1504+
filesize = size_rd
14521505

1453-
cursor.execute(query, (transaction_id, fileset_id))
1454-
rows = cursor.fetchall()
1506+
if checksize == "1M":
1507+
checksize = 1048576
1508+
elif checksize == "0":
1509+
checksize = filesize
1510+
if filesize <= int(checksize):
1511+
if (
1512+
checksum,
1513+
filename,
1514+
size,
1515+
size_r,
1516+
size_rd,
1517+
) in set_checksums:
1518+
count += 1
1519+
not_inc_count = True
1520+
# if it was a true match, checksum should be present
1521+
break
1522+
if not not_inc_count:
1523+
count += 1
1524+
if count > 0 and total_detection_files_map[fileset_id] <= count:
1525+
match_counts[fileset_id] = count
1526+
1527+
# Filter only entries with maximum number of matched files
1528+
if not match_counts:
1529+
return []
14551530

1456-
candidates = []
1457-
if rows:
1458-
for row in rows:
1459-
candidates.append(row["fileset_id"])
1531+
max_match = max(match_counts.values())
1532+
candidates = [fid for fid, count in match_counts.items() if count == max_match]
14601533

1461-
for candidate in candidates:
1462-
if not is_full_detection_checksum_match(candidate, fileset, conn):
1463-
candidates.remove(candidate)
1534+
matched_candidates = []
1535+
for candidate in candidates:
1536+
if is_full_detection_checksum_match(candidate, fileset, conn):
1537+
matched_candidates.append(candidate)
1538+
1539+
if len(matched_candidates) != 0:
1540+
candidates = matched_candidates
14641541

1465-
return candidates
1542+
return candidates
14661543

14671544

14681545
def get_unmatched_files(candidate_fileset, fileset, conn):

0 commit comments

Comments
 (0)