Commit feab22d (1 parent: ff9934f)

INTEGRITY: Remove 'obsolete' fileset status entirely.

1 file changed: db_functions.py (+191 additions, -71 deletions)
```diff
--- a/db_functions.py
+++ b/db_functions.py
@@ -146,9 +146,6 @@ def insert_fileset(
         cursor.execute(
             f"UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = {existing_entry}"
         )
-        cursor.execute(
-            f"UPDATE fileset SET status = 'detection' WHERE id = {existing_entry} AND status = 'obsolete'"
-        )
         cursor.execute(f"SELECT status FROM fileset WHERE id = {existing_entry}")
         status = cursor.fetchone()["status"]
         if status == "user":
```
```diff
@@ -610,10 +607,6 @@ def db_insert(data_arr, username=None, skiplog=False):
 
         fileset_count += 1
 
-    if detection:
-        conn.cursor().execute(
-            "UPDATE fileset SET status = 'obsolete' WHERE `timestamp` != FROM_UNIXTIME(@fileset_time_last) AND status = 'detection'"
-        )
     cur = conn.cursor()
 
     try:
```
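These two hunks remove both sides of the 'obsolete' lifecycle: `db_insert` no longer marks stale detection filesets 'obsolete' after a detection run, and `insert_fileset` no longer flips such rows back to 'detection' when they are matched again. Rows already carrying the removed status are left untouched by the commit; a one-off cleanup along the lines below would reset them (a hypothetical sketch, not part of this commit, reusing the `fileset` table and status values from the queries above):

```python
# Hypothetical one-off cleanup for rows left behind in the removed
# 'obsolete' status (not part of this commit).
def migrate_obsolete_filesets(conn):
    with conn.cursor() as cursor:
        cursor.execute(
            "UPDATE fileset SET status = 'detection' WHERE status = 'obsolete'"
        )
    conn.commit()
```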
```diff
@@ -2130,22 +2123,39 @@ def set_glk_filter_candidate_filesets(
         for row in rows:
             candidates.append(row["fileset_id"])
 
+        matched_candidates = []
+
+        candidates = [
+            candidate
+            for candidate in candidates
+            if is_candidate_by_checksize(candidate, fileset, conn)
+        ]
+
+        for candidate in candidates:
+            if is_full_detection_checksum_match(candidate, fileset, conn):
+                matched_candidates.append(candidate)
+
+        if len(matched_candidates) != 0:
+            candidates = matched_candidates
+
     return (candidates, fileset_count)
 
 
 def set_filter_candidate_filesets(
     fileset_id, fileset, fileset_count, transaction_id, conn
 ):
     """
-    Returns a list of candidate filesets that can be merged
+    Returns a list of candidate filesets that can be merged.
+    Performs early filtering in SQL (by engine, name, size) and then
+    applies checksum filtering and max-match filtering in Python.
     """
     with conn.cursor() as cursor:
-        # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches
         fileset_count += 1
         console_log_candidate_filtering(fileset_count)
+
+        # Early filter candidates using enginename, filename and size
         query = """
-        WITH candidate_fileset AS (
-            SELECT fs.id AS fileset_id, f.name, f.size
+            SELECT fs.id AS fileset_id, f.id AS file_id, f.name, f.size
             FROM file f
             JOIN fileset fs ON f.fileset = fs.id
             JOIN game g ON g.id = fs.game
```
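The block added to `set_glk_filter_candidate_filesets` refines the SQL pre-selection in two stages, isolated below for clarity (a restatement of the committed code, not new commit code; `is_candidate_by_checksize` and `is_full_detection_checksum_match` are existing helpers in db_functions.py): first a cheap size-compatibility filter, then a strict full-checksum pass whose result is adopted only when it is non-empty.

```python
# The two-stage refinement added above, shown in isolation.
def refine_candidates(candidates, fileset, conn):
    # Stage 1: drop candidates whose file sizes cannot match the set fileset.
    candidates = [
        c for c in candidates if is_candidate_by_checksize(c, fileset, conn)
    ]
    # Stage 2: prefer candidates whose detection checksums all match;
    # fall back to the stage-1 list when none do.
    matched = [
        c
        for c in candidates
        if is_full_detection_checksum_match(c, fileset, conn)
    ]
    return matched if matched else candidates
```

The non-empty guard means the candidate list never collapses to nothing merely because full checksums are unavailable.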
```diff
@@ -2154,65 +2164,94 @@ def set_filter_candidate_filesets(
             WHERE e.engineid = %s
             AND f.detection = 1
             AND t.transaction != %s
-        ),
-        total_detection_files AS (
-            SELECT cf.fileset_id, COUNT(*) AS detection_files_found
-            FROM candidate_fileset cf
-            GROUP BY fileset_id
-        ),
-        set_fileset AS (
-            SELECT name, size FROM file
-            WHERE fileset = %s
-        ),
-        matched_detection_files AS (
-            SELECT cf.fileset_id, COUNT(*) AS match_files_count
-            FROM candidate_fileset cf
-            JOIN set_fileset sf ON ( (
-                cf.name = sf.name
-                OR
-                REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '')
-            ) AND (cf.size = sf.size OR cf.size = -1) )
-            GROUP BY cf.fileset_id
-        ),
-        valid_matched_detection_files AS (
-            SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count
-            FROM matched_detection_files mdf
-            JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id
-            WHERE tdf.detection_files_found <= mdf.match_files_count
-        ),
-        max_match_count AS (
-            SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files
-        )
-        SELECT vmdf.fileset_id
-        FROM valid_matched_detection_files vmdf
-        JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id
-        JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count
         """
-
-        cursor.execute(query, (fileset["sourcefile"], transaction_id, fileset_id))
-        rows = cursor.fetchall()
-
-        candidates = []
-        if rows:
-            for row in rows:
-                candidates.append(row["fileset_id"])
-
-        matched_candidates = []
-
-        candidates = [
-            candidate
-            for candidate in candidates
-            if is_candidate_by_checksize(candidate, fileset, conn)
-        ]
-
-        for candidate in candidates:
-            if is_full_detection_checksum_match(candidate, fileset, conn):
-                matched_candidates.append(candidate)
-
-        if len(matched_candidates) != 0:
-            candidates = matched_candidates
-
-        return (candidates, fileset_count)
+        cursor.execute(query, (fileset["sourcefile"], transaction_id))
+        raw_candidates = cursor.fetchall()
+
+        # fileset id to detection files map
+        candidate_map = defaultdict(list)
+        total_detection_files_map = defaultdict(int)
+        for row in raw_candidates:
+            candidate_map[row["fileset_id"]].append(
+                {
+                    "file_id": row["file_id"],
+                    "name": row["name"],
+                    "size": row["size"],
+                }
+            )
+        for id, files in candidate_map.items():
+            total_detection_files_map[id] = len(files)
+
+        set_checksums = set()
+        set_file_name_size = set()
+        for file in fileset["rom"]:
+            for key in file:
+                if key.startswith("md5"):
+                    name = os.path.basename(normalised_path(file["name"]))
+                    set_checksums.add((file[key], name.lower(), int(file["size"])))
+                    set_checksums.add((file[key], name.lower(), -1))
+                    set_file_name_size.add((name.lower(), -1))
+                    set_file_name_size.add((name.lower(), int(file["size"])))
+
+        # Filter candidates by detection filename and file size (including -1) and increase matched file count
+        # if filesize = -1,
+        # elif filesize <= checksize and checksum matches,
+        # elif filesize > checksize.
+        match_counts = {}
+        for fileset_id, files in candidate_map.items():
+            count = 0
+            with conn.cursor() as cursor:
+                for f in files:
+                    filename = os.path.basename(f["name"]).lower()
+                    filesize = f["size"]
+                    if (filename, filesize) in set_file_name_size:
+                        if filesize == -1:
+                            count += 1
+                        else:
+                            cursor.execute(
+                                """
+                                SELECT checksum, checksize, checktype
+                                FROM filechecksum
+                                WHERE file = %s
+                                """,
+                                (f["file_id"],),
+                            )
+                            checksums = cursor.fetchall()
+                            not_inc_count = False
+                            for c in checksums:
+                                checksum = c["checksum"]
+                                checksize = c["checksize"]
+                                if checksize == "1M":
+                                    checksize = 1048576
+                                elif checksize == "0":
+                                    checksize = filesize
+                                if filesize <= int(checksize):
+                                    if (checksum, filename, filesize) in set_checksums:
+                                        count += 1
+                                    not_inc_count = True
+                                    # if it was a true match, checksum should be present
+                                    break
+                            if not not_inc_count:
+                                count += 1
+            if count > 0 and total_detection_files_map[fileset_id] <= count:
+                match_counts[fileset_id] = count
+
+        # Filter only entries with maximum number of matched files
+        if not match_counts:
+            return ([], fileset_count)
+
+        max_match = max(match_counts.values())
+        candidates = [fid for fid, count in match_counts.items() if count == max_match]
+
+        matched_candidates = []
+        for candidate in candidates:
+            if is_full_detection_checksum_match(candidate, fileset, conn):
+                matched_candidates.append(candidate)
+
+        if len(matched_candidates) != 0:
+            candidates = matched_candidates
+
+        return (candidates, fileset_count)
 
 
 def is_candidate_by_checksize(candidate, fileset, conn):
```
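The per-file counting loop above encodes a three-way rule that is easy to miss in the nesting: a detection file with unknown size (-1) matches on name alone; a file whose size fits within some checksum's coverage must have that checksum present in the set fileset; and a file larger than every checksum's coverage is accepted on name and size, since a prefix checksum can neither confirm nor refute the match. The subtle part is normalising the `checksize` column ('1M' covers the first megabyte, '0' the whole file). A minimal sketch of that mapping follows; `normalise_checksize` is an illustrative name, not part of the commit:

```python
# Sketch of the checksize normalisation used in the loop above
# (illustrative helper, not part of the commit).
def normalise_checksize(checksize: str, filesize: int) -> int:
    if checksize == "1M":
        return 1048576  # checksum covers the first megabyte only
    if checksize == "0":
        return filesize  # checksum covers the whole file
    return int(checksize)

# Example: a 2 MiB file checked against an md5-1M checksum.
assert normalise_checksize("1M", 2097152) == 1048576
# 2097152 > 1048576, so the prefix checksum proves nothing and the
# name/size match is counted as-is.
```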
```diff
@@ -2826,6 +2865,87 @@ def finalize_fileset_insertion(
     )
 
 
+# def user_integrity_check(data, ip, game_metadata=None):
+#     src = "user"
+#     source_status = src
+#     new_files = []
+
+#     for file in data["files"]:
+#         new_file = {
+#             "name": file["name"],
+#             "size": file["size"],
+#             "size-r": file["size-r"] if "size-r" in file else 0,
+#             "size-rd": file["size-rd"] if "size-rd" in file else 0,
+#         }
+#         for checksum in file["checksums"]:
+#             checksum_type = checksum["type"]
+#             checksum_value = checksum["checksum"]
+#             new_file[checksum_type] = checksum_value
+
+#         new_files.append(new_file)
+
+#     data["rom"] = new_files
+#     key = calc_key(data)
+#     try:
+#         conn = db_connect()
+#     except Exception as e:
+#         print(f"Failed to connect to database: {e}")
+#         return
+
+#     conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}")
+
+
+#     try:
+#         with conn.cursor() as cursor:
+#             cursor.execute("SELECT MAX(`transaction`) FROM transactions")
+#             transaction_id = cursor.fetchone()["MAX(`transaction`)"] + 1
+
+#             category_text = f"Uploaded from {src}"
+#             log_text = f"Started loading file, State {source_status}. Transaction: {transaction_id}"
+
+#             user = f"cli:{getpass.getuser()}"
+
+#             create_log(
+#                 escape_string(category_text), user, escape_string(log_text), conn
+#             )
+
+#             extra_map = defaultdict(list)
+#             missing_map = defaultdict(list)
+#             extra_set = set()
+#             missing_set = set()
+
+#             filesets_check_for_full = set()
+#             for rom in data["rom"]:
+#                 pre_update_files(rom, filesets_check_for_full, transaction_id, conn)
+
+#             update_status_for_partial_filesets(list(filesets_check_for_full), conn)
+
+#             candidate_filesets = scan_filter_candidate_filesets()
+
+#             if len(candidate_filesets) == 0:
+#                 insert_new_fileset(
+#                     data, conn, None, src, key, None, transaction_id, log_text, user, ip
+#                 )
+#             # return matched_map, missing_map, extra_map
+
+
+#     except Exception as e:
+#         conn.rollback()
+#         print(f"Error processing user data: {e}")
+#     finally:
+#         category_text = f"Uploaded from {src}"
+#         log_text = f"Completed loading file, State {source_status}. Transaction: {transaction_id}"
+#         create_log(escape_string(category_text), user, escape_string(log_text), conn)
+#         conn.close()
+#     # return matched_map, missing_map, extra_map
+
+
+def update_status_for_partial_filesets(fileset_list, conn):
+    """
+    Updates the status of the given filesets from partial to full, if all of their files have full checksums.
+    """
+
+
 def user_integrity_check(data, ip, game_metadata=None):
     src = "user"
     source_status = src
```
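`update_status_for_partial_filesets` is added here as a docstring-only stub. Going only by that docstring, an implementation could look like the sketch below. This is hypothetical: the query shape, the 'partial'/'full' status values, and treating `checksize = '0'` as a full-file checksum are assumptions drawn from the docstring and the schema used elsewhere in this file.

```python
# Hypothetical implementation of the stub above, based on its docstring.
def update_status_for_partial_filesets(fileset_list, conn):
    """
    Updates the status of the given filesets from partial to full,
    if all of their files have full checksums.
    """
    with conn.cursor() as cursor:
        for fileset_id in fileset_list:
            # Count files in the fileset that lack a full-file checksum
            # (checksize '0' covers the whole file).
            cursor.execute(
                """
                SELECT COUNT(*) AS missing
                FROM file f
                WHERE f.fileset = %s
                AND NOT EXISTS (
                    SELECT 1 FROM filechecksum fc
                    WHERE fc.file = f.id AND fc.checksize = '0'
                )
                """,
                (fileset_id,),
            )
            if cursor.fetchone()["missing"] == 0:
                cursor.execute(
                    "UPDATE fileset SET status = 'full' "
                    "WHERE id = %s AND status = 'partial'",
                    (fileset_id,),
                )
```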
```diff
@@ -2835,8 +2955,8 @@ def user_integrity_check(data, ip, game_metadata=None):
         new_file = {
             "name": file["name"],
             "size": file["size"],
-            "size-r": file["size-r"],
-            "size-rd": file["size-rd"],
+            "size-r": file["size-r"] if "size-r" in file else 0,
+            "size-rd": file["size-rd"] if "size-rd" in file else 0,
         }
         for checksum in file["checksums"]:
             checksum_type = checksum["type"]
```
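The guarded lookups make `user_integrity_check` tolerant of uploads that omit `size-r`/`size-rd`, matching the commented-out variant earlier in the diff. Assuming `file` is a plain dict, `dict.get` expresses the same defaulting more compactly (an equivalent idiom, not what the commit uses):

```python
# Equivalent defaulting via dict.get (illustrative alternative).
new_file = {
    "name": file["name"],
    "size": file["size"],
    "size-r": file.get("size-r", 0),
    "size-rd": file.get("size-rd", 0),
}
```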
