Skip to content

Commit 493acb5

Browse files
committed
INTEGRITY: Traverse set.dat instead of candidate fileset while searching mismatched files.
1 parent 90ffe1a commit 493acb5

File tree

1 file changed

+26
-23
lines changed

1 file changed

+26
-23
lines changed

db_functions.py

Lines changed: 26 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -281,8 +281,6 @@ def add_all_equal_checksums(checksize, checktype, checksum, file_id, conn):
281281
size_name = "size"
282282
if checktype[-1] == "r":
283283
size_name += "-rd"
284-
if checktype[-1] == "s":
285-
size_name += "-d"
286284

287285
cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = {file_id}")
288286
result = cursor.fetchone()
@@ -1345,7 +1343,6 @@ def update_all_files(fileset, candidate_fileset_id, is_candidate_detection, conn
13451343
`size-rd` = %s
13461344
"""
13471345
sizes = filepath_to_sizes_map[filepath]
1348-
print(sizes)
13491346
if is_candidate_detection:
13501347
query += ",name = %s WHERE id = %s"
13511348
params = (
@@ -1462,10 +1459,10 @@ def scan_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn):
14621459

14631460
def get_unmatched_files(candidate_fileset, fileset, conn):
14641461
"""
1465-
Checks if all checksums from candidate_fileset match scan file checksums.
1462+
Checks if all checksums from candidate_fileset match dat file checksums.
14661463
Returns:
14671464
unmatched_candidate_files: candidate files whose checksums weren't found in scan
1468-
unmatched_scan_files: scan files whose checksums weren't matched by candidate
1465+
unmatched_dat_files: dat files whose checksums weren't matched by candidate
14691466
"""
14701467
with conn.cursor() as cursor:
14711468
cursor.execute(
@@ -1474,18 +1471,18 @@ def get_unmatched_files(candidate_fileset, fileset, conn):
14741471
candidate_file_rows = cursor.fetchall()
14751472
candidate_files = {row["id"]: row["name"] for row in candidate_file_rows}
14761473

1477-
scan_checksums = set()
1478-
scan_names_by_checksum = {}
1474+
dat_checksums = set()
1475+
dat_names_by_checksum = {}
14791476

14801477
for file in fileset["rom"]:
14811478
base_name = os.path.basename(normalised_path(file["name"])).lower()
14821479
for key in file:
14831480
if key.startswith("md5"):
1484-
scan_checksums.add((file[key], base_name))
1485-
scan_names_by_checksum[(file[key], base_name)] = file["name"]
1481+
dat_checksums.add((file[key], base_name))
1482+
dat_names_by_checksum[(file[key], base_name)] = file["name"]
14861483

14871484
unmatched_candidate_files = []
1488-
matched_scan_pairs = set()
1485+
matched_dat_pairs = set()
14891486

14901487
for file_id, file_name in candidate_files.items():
14911488
cursor.execute(
@@ -1498,21 +1495,21 @@ def get_unmatched_files(candidate_fileset, fileset, conn):
14981495

14991496
for row in checksum_rows:
15001497
checksum = row["checksum"]
1501-
if (checksum, base_name) in scan_checksums:
1502-
matched_scan_pairs.add((checksum, base_name))
1498+
if (checksum, base_name) in dat_checksums:
1499+
matched_dat_pairs.add((checksum, base_name))
15031500
match_found = True
15041501

15051502
if not match_found:
15061503
unmatched_candidate_files.append(file_name)
15071504

1508-
unmatched_scan_files = {
1509-
scan_names_by_checksum[key]
1510-
for key in scan_checksums
1511-
if key not in matched_scan_pairs
1505+
unmatched_dat_files = {
1506+
dat_names_by_checksum[key]
1507+
for key in dat_checksums
1508+
if key not in matched_dat_pairs
15121509
}
1513-
unmatched_scan_files = list(unmatched_scan_files)
1510+
unmatched_dat_files = list(unmatched_dat_files)
15141511

1515-
return (unmatched_candidate_files, unmatched_scan_files)
1512+
return (unmatched_candidate_files, unmatched_dat_files)
15161513

15171514

15181515
def is_full_detection_checksum_match(candidate_fileset, fileset, conn):
@@ -1524,7 +1521,7 @@ def is_full_detection_checksum_match(candidate_fileset, fileset, conn):
15241521
"""
15251522
with conn.cursor() as cursor:
15261523
cursor.execute(
1527-
"SELECT id, name FROM file WHERE detection=1 AND fileset = %s",
1524+
"SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name FROM file WHERE detection=1 AND fileset = %s",
15281525
(candidate_fileset,),
15291526
)
15301527
target_files = cursor.fetchall()
@@ -1682,7 +1679,7 @@ def set_process(
16821679
console_message = "Candidate filtering finished."
16831680
console_log(console_message)
16841681
console_message = (
1685-
f"{dropped_early_no_candidate} Filesets Dropped - No candidates found."
1682+
f"{dropped_early_no_candidate} Filesets Dropped for No candidates."
16861683
)
16871684
console_log(console_message)
16881685
console_message = "Looking for duplicates..."
@@ -1872,9 +1869,15 @@ def set_perform_match(
18721869
matched_fileset_id, manual_merge_map, set_to_candidate_dict, conn
18731870
)
18741871
elif status == "partial" or status == "full":
1875-
(is_match, unmatched_files) = is_full_checksum_match(
1872+
(unmatched_candidate_files, unmatched_dat_files) = get_unmatched_files(
18761873
matched_fileset_id, fileset, conn
18771874
)
1875+
is_match = (
1876+
True
1877+
if len(unmatched_candidate_files) == 0
1878+
and len(unmatched_dat_files) == 0
1879+
else False
1880+
)
18781881
if is_match:
18791882
category_text = "Already present"
18801883
log_text = f"Already present as - Fileset:{matched_fileset_id}. Deleting Fileset:{fileset_id}"
@@ -1890,7 +1893,8 @@ def set_perform_match(
18901893

18911894
else:
18921895
category_text = "Mismatch"
1893-
log_text = f"Fileset:{fileset_id} mismatched with Fileset:{matched_fileset_id} with status:{status}. Try manual merge."
1896+
log_text = f"Fileset:{fileset_id} mismatched with Fileset:{matched_fileset_id} with status:{status}. Try manual merge. Unmatched Files in set.dat fileset = {len(unmatched_dat_files)} Unmatched Files in candidate fileset = {len(unmatched_candidate_files)}. List of unmatched files scan.dat : {', '.join(scan_file for scan_file in unmatched_dat_files)}, List of unmatched files full fileset : {', '.join(scan_file for scan_file in unmatched_candidate_files)}"
1897+
console_log(log_text)
18941898
# print_text = f"Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}. Unmatched files: {len(unmatched_files)}."
18951899
mismatch_filesets += 1
18961900
add_manual_merge(
@@ -1904,7 +1908,6 @@ def set_perform_match(
19041908

19051909
elif len(candidate_filesets) > 1:
19061910
found_match = False
1907-
19081911
for candidate_fileset in candidate_filesets:
19091912
(is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn)
19101913
if is_match:

0 commit comments

Comments
 (0)