Skip to content

Commit 45bdfc7

Browse files
committed
INTEGRITY: Filter manual merge candidates if size mismatch.
1 parent fbbe728 commit 45bdfc7

File tree

1 file changed

+135
-34
lines changed

1 file changed

+135
-34
lines changed

db_functions.py

Lines changed: 135 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -1052,6 +1052,8 @@ def set_process(
10521052
del set_to_candidate_dict[set_fileset]
10531053
del id_to_fileset_dict[set_fileset]
10541054

1055+
manual_merge_map = defaultdict(list)
1056+
10551057
for fileset_id, candidate_filesets in set_to_candidate_dict.items():
10561058
fileset = id_to_fileset_dict[fileset_id]
10571059

@@ -1060,16 +1062,6 @@ def set_process(
10601062
fileset["name"], candidate_filesets, conn
10611063
)
10621064

1063-
for candidate_fileset in candidate_filesets:
1064-
with conn.cursor() as cursor:
1065-
cursor.execute(
1066-
"SELECT id FROM fileset WHERE status = 'current' AND id = %s",
1067-
(candidate_fileset),
1068-
)
1069-
result = cursor.fetchone()
1070-
if result:
1071-
candidate_filesets.remove(candidate_fileset)
1072-
10731065
(
10741066
fully_matched_filesets,
10751067
auto_merged_filesets,
@@ -1086,14 +1078,31 @@ def set_process(
10861078
auto_merged_filesets,
10871079
manual_merged_filesets,
10881080
mismatch_filesets,
1081+
manual_merge_map,
1082+
set_to_candidate_dict,
10891083
conn,
10901084
skiplog,
10911085
)
10921086

1087+
# print(manual_merge_map)
1088+
1089+
for fileset_id, candidates in manual_merge_map.items():
1090+
category_text = "Manual Merge Required"
1091+
log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidates)}."
1092+
manual_merged_filesets += 1
1093+
# print(candidates)
1094+
add_manual_merge(
1095+
candidates,
1096+
fileset_id,
1097+
category_text,
1098+
log_text,
1099+
log_text,
1100+
user,
1101+
conn,
1102+
)
1103+
10931104
# Final log
10941105
with conn.cursor() as cursor:
1095-
cursor.execute("UPDATE fileset SET status = 'partial' WHERE status = 'current'")
1096-
10971106
cursor.execute(
10981107
"SELECT COUNT(fileset) from transactions WHERE `transaction` = %s",
10991108
(transaction_id,),
@@ -1156,6 +1165,8 @@ def set_perform_match(
11561165
auto_merged_filesets,
11571166
manual_merged_filesets,
11581167
mismatch_filesets,
1168+
manual_merge_map,
1169+
set_to_candidate_dict,
11591170
conn,
11601171
skiplog,
11611172
):
@@ -1170,7 +1181,7 @@ def set_perform_match(
11701181
)
11711182
status = cursor.fetchone()["status"]
11721183
if status == "detection":
1173-
update_fileset_status(cursor, matched_fileset_id, "current")
1184+
update_fileset_status(cursor, matched_fileset_id, "partial")
11741185
set_populate_file(fileset, matched_fileset_id, conn, detection)
11751186
auto_merged_filesets += 1
11761187
if not skiplog:
@@ -1183,6 +1194,9 @@ def set_perform_match(
11831194
conn,
11841195
)
11851196
delete_original_fileset(fileset_id, conn)
1197+
remove_manual_merge_if_size_mismatch(
1198+
matched_fileset_id, manual_merge_map, set_to_candidate_dict, conn
1199+
)
11861200
elif status == "partial" or status == "full":
11871201
(is_match, unmatched_files) = is_full_checksum_match(
11881202
matched_fileset_id, fileset, conn
@@ -1221,7 +1235,7 @@ def set_perform_match(
12211235
for candidate_fileset in candidate_filesets:
12221236
(is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn)
12231237
if is_match:
1224-
update_fileset_status(cursor, candidate_fileset, "current")
1238+
update_fileset_status(cursor, candidate_fileset, "partial")
12251239
set_populate_file(fileset, candidate_fileset, conn, detection)
12261240
auto_merged_filesets += 1
12271241
if not skiplog:
@@ -1234,22 +1248,14 @@ def set_perform_match(
12341248
conn,
12351249
)
12361250
delete_original_fileset(fileset_id, conn)
1251+
remove_manual_merge_if_size_mismatch(
1252+
candidate_fileset, manual_merge_map, set_to_candidate_dict, conn
1253+
)
12371254
found_match = True
12381255
break
12391256

12401257
if not found_match:
1241-
category_text = "Manual Merge Required"
1242-
log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidate_filesets)}."
1243-
manual_merged_filesets += 1
1244-
add_manual_merge(
1245-
candidate_filesets,
1246-
fileset_id,
1247-
category_text,
1248-
log_text,
1249-
log_text,
1250-
user,
1251-
conn,
1252-
)
1258+
manual_merge_map[fileset_id] = candidate_filesets
12531259

12541260
return (
12551261
fully_matched_filesets,
@@ -1259,6 +1265,98 @@ def set_perform_match(
12591265
)
12601266

12611267

1268+
def remove_manual_merge_if_size_mismatch(
    child_fileset, manual_merge_map, set_to_candidate_dict, conn
):
    """Drop `child_fileset` as a merge candidate of parents whose files do not fit it.

    The detection files (`file.detection = 1`) of `child_fileset` are read
    once. Then, for every parent in `manual_merge_map` and afterwards in
    `set_to_candidate_dict` whose candidate list contains `child_fileset`,
    each of those files is looked up in the parent by exact name and size.
    On the first file with no matching row in the parent, the pair is handed
    to `remove_manual_merge` and the scan moves on to the next parent.

    Args:
        child_fileset: Id of the fileset whose detection files are compared.
        manual_merge_map: Mapping of parent fileset id -> list of candidate ids.
        set_to_candidate_dict: Mapping of parent fileset id -> list of candidate ids.
        conn: Database connection; `conn.cursor()` must be usable as a context
            manager and yield rows addressable by column name.
    """
    with conn.cursor() as cursor:
        query = """
            SELECT f.name, f.size
            FROM fileset fs
            JOIN file f ON f.fileset = fs.id
            WHERE fs.id = %s
            AND f.detection = 1
        """
        cursor.execute(query, (child_fileset,))

        # Rows with size -1 never take part in the comparison, so filter them
        # once up front (presumably -1 marks an unknown size -- TODO confirm).
        sized_files = [file for file in cursor.fetchall() if file["size"] != -1]
        if not sized_files:
            # Nothing comparable: no parent can be rejected.
            return

        # Both maps get the same treatment, in the same order as before.
        # Iterating the dicts is safe here: remove_manual_merge only shrinks
        # the candidate lists, it never adds or removes dict keys.
        for candidate_map in (manual_merge_map, set_to_candidate_dict):
            for parent_fileset, child_list in candidate_map.items():
                if child_fileset not in child_list:
                    continue

                if _parent_lacks_sized_file(cursor, parent_fileset, sized_files):
                    remove_manual_merge(
                        child_fileset,
                        parent_fileset,
                        manual_merge_map,
                        set_to_candidate_dict,
                        conn,
                    )


def _parent_lacks_sized_file(cursor, parent_fileset, sized_files):
    """Return True at the first file of `sized_files` with no same-name, same-size row in `parent_fileset`."""
    query = """
        SELECT f.id
        FROM fileset fs
        JOIN file f ON f.fileset = fs.id
        WHERE fs.id = %s
        AND f.name = %s
        AND f.size = %s
    """
    for file in sized_files:
        cursor.execute(query, (parent_fileset, file["name"], file["size"]))
        if not cursor.fetchall():
            return True
    return False
1339+
1340+
1341+
def remove_manual_merge(
    child_fileset, parent_fileset, manual_merge_map, set_to_candidate_dict, conn
):
    """Forget that `child_fileset` is a merge candidate for `parent_fileset`.

    The child id is removed (first occurrence only) from the parent's entry in
    `manual_merge_map` and then in `set_to_candidate_dict`, when present.
    Regardless of whether either map held the pair, the matching row is
    deleted from the `possible_merges` table. No commit is issued here.

    Args:
        child_fileset: Candidate fileset id to drop.
        parent_fileset: Fileset id whose candidate lists are edited.
        manual_merge_map: Mapping of parent fileset id -> list of candidate ids.
        set_to_candidate_dict: Mapping of parent fileset id -> list of candidate ids.
        conn: Database connection whose `cursor()` works as a context manager.
    """
    for candidate_map in (manual_merge_map, set_to_candidate_dict):
        # Membership test first so a defaultdict never grows a new key.
        if parent_fileset not in candidate_map:
            continue
        siblings = candidate_map[parent_fileset]
        if child_fileset in siblings:
            siblings.remove(child_fileset)

    with conn.cursor() as cursor:
        query = """
            DELETE FROM possible_merges
            WHERE child_fileset = %s
            AND parent_fileset = %s
        """
        cursor.execute(query, (child_fileset, parent_fileset))
1358+
1359+
12621360
def add_manual_merge(
12631361
child_filesets, parent_fileset, category_text, log_text, print_text, user, conn
12641362
):
@@ -1835,15 +1933,18 @@ def set_populate_file(fileset, fileset_id, conn, detection):
18351933

18361934
filename = os.path.basename(normalised_path(file["name"]))
18371935

1838-
if (engine_name == "glk" and file["size"] not in candidate_file_size) and (
1839-
(filename.lower(), file["size"]) in seen_detection_files
1840-
or (
1841-
filename.lower() not in candidate_files
1936+
if (engine_name == "glk" and file["size"] not in candidate_file_size) or (
1937+
engine_name != "glk"
1938+
and (
1939+
(filename.lower(), file["size"]) in seen_detection_files
18421940
or (
1843-
filename.lower() in candidate_files
1844-
and (
1845-
candidate_files[filename.lower()][1] != -1
1846-
and candidate_files[filename.lower()][1] != file["size"]
1941+
filename.lower() not in candidate_files
1942+
or (
1943+
filename.lower() in candidate_files
1944+
and (
1945+
candidate_files[filename.lower()][1] != -1
1946+
and candidate_files[filename.lower()][1] != file["size"]
1947+
)
18471948
)
18481949
)
18491950
)

0 commit comments

Comments
 (0)