Skip to content

Commit ff9934f

Browse files
committed
INTEGRITY: Add checksum based filtering in set.dat, when possible.
1 parent 493acb5 commit ff9934f

File tree

1 file changed

+97
-37
lines changed

1 file changed

+97
-37
lines changed

db_functions.py

Lines changed: 97 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -977,7 +977,9 @@ def scan_process(
977977

978978
id_to_fileset_mapping = defaultdict(dict)
979979

980+
fileset_count = 0
980981
for fileset in game_data:
982+
console_log_file_update(fileset_count)
981983
key = calc_key(fileset)
982984
megakey = ""
983985
log_text = f"State {source_status}."
@@ -1003,9 +1005,12 @@ def scan_process(
10031005
filesets_check_for_full = set()
10041006

10051007
for rom in fileset["rom"]:
1006-
scan_update_files(rom, filesets_check_for_full, transaction_id, conn)
1008+
pre_update_files(rom, filesets_check_for_full, transaction_id, conn)
1009+
fileset_count += 1
10071010

1011+
fileset_count = 0
10081012
for fileset_id, fileset in id_to_fileset_mapping.items():
1013+
console_log_matching(fileset_count)
10091014
candidate_filesets = scan_filter_candidate_filesets(
10101015
fileset_id, fileset, transaction_id, conn
10111016
)
@@ -1047,6 +1052,7 @@ def scan_process(
10471052
conn,
10481053
skiplog,
10491054
)
1055+
fileset_count += 1
10501056

10511057
# Final log
10521058
with conn.cursor() as cursor:
@@ -1063,7 +1069,7 @@ def scan_process(
10631069
create_log(escape_string(category_text), user, escape_string(log_text), conn)
10641070

10651071

1066-
def scan_update_files(rom, filesets_check_for_full, transaction_id, conn):
1072+
def pre_update_files(rom, filesets_check_for_full, transaction_id, conn):
10671073
"""
10681074
Updates all the checksums for the files matching by a checksum and size.
10691075
"""
@@ -1074,6 +1080,9 @@ def scan_update_files(rom, filesets_check_for_full, transaction_id, conn):
10741080
checksums[key] = rom[key]
10751081

10761082
files_to_update = set()
1083+
size = rom["size"] if "size" in rom else 0
1084+
size_r = rom["size-r"] if "size-r" in rom else 0
1085+
size_rd = rom["size-rd"] if "size-rd" in rom else 0
10771086

10781087
for _, checksum in checksums.items():
10791088
query = """
@@ -1088,9 +1097,7 @@ def scan_update_files(rom, filesets_check_for_full, transaction_id, conn):
10881097
AND f.`size-rd` = %s
10891098
AND t.transaction != %s
10901099
"""
1091-
size = rom["size"] if "size" in rom else 0
1092-
size_r = rom["size-r"] if "size-r" in rom else 0
1093-
size_rd = rom["size-rd"] if "size-rd" in rom else 0
1100+
10941101
cursor.execute(query, (checksum, size, size_r, size_rd, transaction_id))
10951102
result = cursor.fetchall()
10961103
if result:
@@ -1104,12 +1111,20 @@ def scan_update_files(rom, filesets_check_for_full, transaction_id, conn):
11041111
WHERE file = %s
11051112
"""
11061113
cursor.execute(query, (file_id,))
1114+
# Update checksums
11071115
for check, checksum in checksums.items():
11081116
checksize, checktype, checksum = get_checksum_props(check, checksum)
11091117
query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)"
11101118
cursor.execute(query, (file_id, checksize, checktype, checksum))
1111-
1112-
conn.commit()
1119+
# Update sizes
# The last SET assignment must not be followed by a comma: a trailing comma
# directly before WHERE is a SQL syntax error.
query = """
    UPDATE file
    SET size = %s,
    `size-r` = %s,
    `size-rd` = %s
    WHERE id = %s
"""
# Bind values go in a single sequence (the second argument of execute());
# passing them as separate positional arguments raises TypeError.
cursor.execute(query, (size, size_r, size_rd, file_id))
11131128

11141129

11151130
def scan_perform_match(
@@ -1907,31 +1922,7 @@ def set_perform_match(
19071922
)
19081923

19091924
elif len(candidate_filesets) > 1:
1910-
found_match = False
1911-
for candidate_fileset in candidate_filesets:
1912-
(is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn)
1913-
if is_match:
1914-
update_fileset_status(cursor, candidate_fileset, "partial")
1915-
set_populate_file(fileset, candidate_fileset, conn, detection)
1916-
auto_merged_filesets += 1
1917-
if not skiplog:
1918-
log_matched_fileset(
1919-
src,
1920-
fileset_id,
1921-
candidate_fileset,
1922-
"partial",
1923-
user,
1924-
conn,
1925-
)
1926-
delete_original_fileset(fileset_id, conn)
1927-
remove_manual_merge_if_size_mismatch(
1928-
candidate_fileset, manual_merge_map, set_to_candidate_dict, conn
1929-
)
1930-
found_match = True
1931-
break
1932-
1933-
if not found_match:
1934-
manual_merge_map[fileset_id] = candidate_filesets
1925+
manual_merge_map[fileset_id] = candidate_filesets
19351926

19361927
return (
19371928
fully_matched_filesets,
@@ -2160,8 +2151,7 @@ def set_filter_candidate_filesets(
21602151
JOIN game g ON g.id = fs.game
21612152
JOIN engine e ON e.id = g.engine
21622153
JOIN transactions t ON t.fileset = fs.id
2163-
WHERE fs.id != %s
2164-
AND e.engineid = %s
2154+
WHERE e.engineid = %s
21652155
AND f.detection = 1
21662156
AND t.transaction != %s
21672157
),
@@ -2199,19 +2189,84 @@ def set_filter_candidate_filesets(
21992189
JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count
22002190
"""
22012191

2202-
cursor.execute(
2203-
query, (fileset_id, fileset["sourcefile"], transaction_id, fileset_id)
2204-
)
2192+
cursor.execute(query, (fileset["sourcefile"], transaction_id, fileset_id))
22052193
rows = cursor.fetchall()
22062194

22072195
candidates = []
22082196
if rows:
22092197
for row in rows:
22102198
candidates.append(row["fileset_id"])
22112199

2200+
matched_candidates = []
2201+
2202+
candidates = [
2203+
candidate
2204+
for candidate in candidates
2205+
if is_candidate_by_checksize(candidate, fileset, conn)
2206+
]
2207+
2208+
for candidate in candidates:
2209+
if is_full_detection_checksum_match(candidate, fileset, conn):
2210+
matched_candidates.append(candidate)
2211+
2212+
if len(matched_candidates) != 0:
2213+
candidates = matched_candidates
2214+
22122215
return (candidates, fileset_count)
22132216

22142217

2218+
def is_candidate_by_checksize(candidate, fileset, conn):
    """
    Decide whether `candidate` survives the checksum-based pre-filter.

    Loads the candidate fileset's detection files (rows of `file` with
    detection=1) together with their stored `filechecksum` rows and compares
    them with the md5* entries of `fileset["rom"]`, keyed by
    (checksum, lower-cased base filename).

    A stored checksum is treated as ruled out only when all of these hold:
      * its (checksum, filename) pair is absent from the incoming fileset,
      * the detection file's size is <= the checksum's checksize
        ("1M" is read as 1048576),
      * the detection file's size is not -1.

    Returns True on the first stored checksum that is not ruled out, and
    False when there is none (which includes a candidate with no detection
    files or no stored checksums).
    """
    detection_sql = "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE detection=1 AND fileset = %s"
    checksum_sql = """
        SELECT fc.checksum, fc.checksize, fc.checktype
        FROM filechecksum fc
        WHERE fc.file = %s
    """

    with conn.cursor() as cursor:
        cursor.execute(detection_sql, (candidate,))
        # file id -> (base name, size) for every detection file of the candidate
        detection_files = {
            row["id"]: (row["name"], row["size"]) for row in cursor.fetchall()
        }

        # (checksum, lower-cased base filename) pairs present in the incoming data
        scan_pairs = {
            (rom[key], os.path.basename(normalised_path(rom["name"])).lower())
            for rom in fileset["rom"]
            for key in rom
            if key.startswith("md5")
        }

        for file_id, (file_name, file_size) in detection_files.items():
            cursor.execute(checksum_sql, (file_id,))
            for stored in cursor.fetchall() or ():
                limit = stored["checksize"]
                if limit == "1M":
                    limit = 1048576

                wanted = (stored["checksum"], os.path.basename(file_name.lower()))
                # NOTE(review): presumably -1 marks an unknown size - confirm.
                ruled_out = (
                    wanted not in scan_pairs
                    and file_size <= int(limit)
                    and file_size != -1
                )
                if not ruled_out:
                    return True

    return False
2268+
2269+
22152270
def process_fileset(
22162271
fileset,
22172272
resources,
@@ -2972,6 +3027,11 @@ def console_log_candidate_filtering(fileset_count):
29723027
sys.stdout.flush()
29733028

29743029

3030+
def console_log_file_update(fileset_count):
    """
    Write the file-update progress line for `fileset_count` to stdout.

    The line ends with a carriage return instead of a newline, and stdout is
    flushed right after writing.
    """
    stream = sys.stdout
    progress_line = f"Updating files - Fileset {fileset_count}\r"
    stream.write(progress_line)
    stream.flush()
3033+
3034+
29753035
def console_log_matching(fileset_count):
    """
    Write the matching progress line for `fileset_count` to stdout.

    The line ends with a carriage return instead of a newline, and stdout is
    flushed right after writing.
    """
    stream = sys.stdout
    progress_line = f"Performing Match - Fileset {fileset_count}\r"
    stream.write(progress_line)
    stream.flush()

0 commit comments

Comments
 (0)