Commit feab22d (1 parent: ff9934f)

INTEGRITY: Remove 'obsolete' fileset status entirely.

1 file changed: db_functions.py (+191 additions, -71 deletions)
```diff
--- a/db_functions.py
+++ b/db_functions.py
@@ -146,9 +146,6 @@ def insert_fileset(
         cursor.execute(
             f"UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = {existing_entry}"
         )
-        cursor.execute(
-            f"UPDATE fileset SET status = 'detection' WHERE id = {existing_entry} AND status = 'obsolete'"
-        )
         cursor.execute(f"SELECT status FROM fileset WHERE id = {existing_entry}")
         status = cursor.fetchone()["status"]
         if status == "user":
```
```diff
@@ -610,10 +607,6 @@ def db_insert(data_arr, username=None, skiplog=False):
 
         fileset_count += 1
 
-    if detection:
-        conn.cursor().execute(
-            "UPDATE fileset SET status = 'obsolete' WHERE `timestamp` != FROM_UNIXTIME(@fileset_time_last) AND status = 'detection'"
-        )
     cur = conn.cursor()
 
     try:
```
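These two hunks remove both sides of the 'obsolete' lifecycle: `db_insert` no longer marks stale detection filesets 'obsolete' after a detection run, and `insert_fileset` no longer flips such rows back to 'detection' when they are matched again. Rows already carrying the removed status are left untouched by the commit; a one-off cleanup along the lines below would reset them (a hypothetical sketch, not part of this commit, reusing the `fileset` table and status values from the queries above):

```python
# Hypothetical one-off cleanup for rows left behind in the removed
# 'obsolete' status (not part of this commit).
def migrate_obsolete_filesets(conn):
    with conn.cursor() as cursor:
        cursor.execute(
            "UPDATE fileset SET status = 'detection' WHERE status = 'obsolete'"
        )
    conn.commit()
```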
```diff
@@ -2130,22 +2123,39 @@ def set_glk_filter_candidate_filesets(
         for row in rows:
             candidates.append(row["fileset_id"])
 
+        matched_candidates = []
+
+        candidates = [
+            candidate
+            for candidate in candidates
+            if is_candidate_by_checksize(candidate, fileset, conn)
+        ]
+
+        for candidate in candidates:
+            if is_full_detection_checksum_match(candidate, fileset, conn):
+                matched_candidates.append(candidate)
+
+        if len(matched_candidates) != 0:
+            candidates = matched_candidates
+
     return (candidates, fileset_count)
 
 
 def set_filter_candidate_filesets(
     fileset_id, fileset, fileset_count, transaction_id, conn
 ):
     """
-    Returns a list of candidate filesets that can be merged
+    Returns a list of candidate filesets that can be merged.
+    Performs early filtering in SQL (by engine, name, size) and then
+    applies checksum filtering and max-match filtering in Python.
     """
     with conn.cursor() as cursor:
-        # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches
         fileset_count += 1
         console_log_candidate_filtering(fileset_count)
+
+        # Early filter candidates using enginename, filename and size
         query = """
-        WITH candidate_fileset AS (
-            SELECT fs.id AS fileset_id, f.name, f.size
+            SELECT fs.id AS fileset_id, f.id AS file_id, f.name, f.size
             FROM file f
             JOIN fileset fs ON f.fileset = fs.id
             JOIN game g ON g.id = fs.game
```
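The block added to `set_glk_filter_candidate_filesets` refines the SQL pre-selection in two stages, isolated below for clarity (a restatement of the committed code, not new commit code; `is_candidate_by_checksize` and `is_full_detection_checksum_match` are existing helpers in db_functions.py): first a cheap size-compatibility filter, then a strict full-checksum pass whose result is adopted only when it is non-empty.

```python
# The two-stage refinement added above, shown in isolation.
def refine_candidates(candidates, fileset, conn):
    # Stage 1: drop candidates whose file sizes cannot match the set fileset.
    candidates = [
        c for c in candidates if is_candidate_by_checksize(c, fileset, conn)
    ]
    # Stage 2: prefer candidates whose detection checksums all match;
    # fall back to the stage-1 list when none do.
    matched = [
        c
        for c in candidates
        if is_full_detection_checksum_match(c, fileset, conn)
    ]
    return matched if matched else candidates
```

The non-empty guard means the candidate list never collapses to nothing merely because full checksums are unavailable.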
```diff
@@ -2154,65 +2164,94 @@ def set_filter_candidate_filesets(
             WHERE e.engineid = %s
             AND f.detection = 1
             AND t.transaction != %s
-        ),
-        total_detection_files AS (
-            SELECT cf.fileset_id, COUNT(*) AS detection_files_found
-            FROM candidate_fileset cf
-            GROUP BY fileset_id
-        ),
-        set_fileset AS (
-            SELECT name, size FROM file
-            WHERE fileset = %s
-        ),
-        matched_detection_files AS (
-            SELECT cf.fileset_id, COUNT(*) AS match_files_count
-            FROM candidate_fileset cf
-            JOIN set_fileset sf ON ( (
-                cf.name = sf.name
-                OR
-                REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '')
-            ) AND (cf.size = sf.size OR cf.size = -1) )
-            GROUP BY cf.fileset_id
-        ),
-        valid_matched_detection_files AS (
-            SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count
-            FROM matched_detection_files mdf
-            JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id
-            WHERE tdf.detection_files_found <= mdf.match_files_count
-        ),
-        max_match_count AS (
-            SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files
-        )
-        SELECT vmdf.fileset_id
-        FROM valid_matched_detection_files vmdf
-        JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id
-        JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count
         """
-
-        cursor.execute(query, (fileset["sourcefile"], transaction_id, fileset_id))
-        rows = cursor.fetchall()
-
-        candidates = []
-        if rows:
-            for row in rows:
-                candidates.append(row["fileset_id"])
-
-        matched_candidates = []
-
-        candidates = [
-            candidate
-            for candidate in candidates
-            if is_candidate_by_checksize(candidate, fileset, conn)
-        ]
-
-        for candidate in candidates:
-            if is_full_detection_checksum_match(candidate, fileset, conn):
-                matched_candidates.append(candidate)
-
-        if len(matched_candidates) != 0:
-            candidates = matched_candidates
-
-        return (candidates, fileset_count)
+        cursor.execute(query, (fileset["sourcefile"], transaction_id))
+        raw_candidates = cursor.fetchall()
+
+        # fileset id to detection files map
+        candidate_map = defaultdict(list)
+        total_detection_files_map = defaultdict(int)
+        for row in raw_candidates:
+            candidate_map[row["fileset_id"]].append(
+                {
+                    "file_id": row["file_id"],
+                    "name": row["name"],
+                    "size": row["size"],
+                }
+            )
+        for id, files in candidate_map.items():
+            total_detection_files_map[id] = len(files)
+
+        set_checksums = set()
+        set_file_name_size = set()
+        for file in fileset["rom"]:
+            for key in file:
+                if key.startswith("md5"):
+                    name = os.path.basename(normalised_path(file["name"]))
+                    set_checksums.add((file[key], name.lower(), int(file["size"])))
+                    set_checksums.add((file[key], name.lower(), -1))
+                    set_file_name_size.add((name.lower(), -1))
+                    set_file_name_size.add((name.lower(), int(file["size"])))
+
+        # Filter candidates by detection filename and file size (including -1) and increase matched file count
+        # if filesize = -1,
+        # elif filesize <= checksize and checksum matches,
+        # elif filesize > checksize.
+        match_counts = {}
+        for fileset_id, files in candidate_map.items():
+            count = 0
+            with conn.cursor() as cursor:
+                for f in files:
+                    filename = os.path.basename(f["name"]).lower()
+                    filesize = f["size"]
+                    if (filename, filesize) in set_file_name_size:
+                        if filesize == -1:
+                            count += 1
+                        else:
+                            cursor.execute(
+                                """
+                                SELECT checksum, checksize, checktype
+                                FROM filechecksum
+                                WHERE file = %s
+                                """,
+                                (f["file_id"],),
+                            )
+                            checksums = cursor.fetchall()
+                            not_inc_count = False
+                            for c in checksums:
+                                checksum = c["checksum"]
+                                checksize = c["checksize"]
+                                if checksize == "1M":
+                                    checksize = 1048576
+                                elif checksize == "0":
+                                    checksize = filesize
+                                if filesize <= int(checksize):
+                                    if (checksum, filename, filesize) in set_checksums:
+                                        count += 1
+                                    not_inc_count = True
+                                    # if it was a true match, checksum should be present
+                                    break
+                            if not not_inc_count:
+                                count += 1
+            if count > 0 and total_detection_files_map[fileset_id] <= count:
+                match_counts[fileset_id] = count
+
+        # Filter only entries with maximum number of matched files
+        if not match_counts:
+            return ([], fileset_count)
+
+        max_match = max(match_counts.values())
+        candidates = [fid for fid, count in match_counts.items() if count == max_match]
+
+        matched_candidates = []
+        for candidate in candidates:
+            if is_full_detection_checksum_match(candidate, fileset, conn):
+                matched_candidates.append(candidate)
+
+        if len(matched_candidates) != 0:
+            candidates = matched_candidates
+
+        return (candidates, fileset_count)
 
 
 def is_candidate_by_checksize(candidate, fileset, conn):
```
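The per-file counting loop above encodes a three-way rule that is easy to miss in the nesting: a detection file with unknown size (-1) matches on name alone; a file whose size fits within some checksum's coverage must have that checksum present in the set fileset; and a file larger than every checksum's coverage is accepted on name and size, since a prefix checksum can neither confirm nor refute the match. The subtle part is normalising the `checksize` column ('1M' covers the first megabyte, '0' the whole file). A minimal sketch of that mapping follows; `normalise_checksize` is an illustrative name, not part of the commit:

```python
# Sketch of the checksize normalisation used in the loop above
# (illustrative helper, not part of the commit).
def normalise_checksize(checksize: str, filesize: int) -> int:
    if checksize == "1M":
        return 1048576  # checksum covers the first megabyte only
    if checksize == "0":
        return filesize  # checksum covers the whole file
    return int(checksize)

# Example: a 2 MiB file checked against an md5-1M checksum.
assert normalise_checksize("1M", 2097152) == 1048576
# 2097152 > 1048576, so the prefix checksum proves nothing and the
# name/size match is counted as-is.
```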
```diff
@@ -2826,6 +2865,87 @@ def finalize_fileset_insertion(
     )
 
 
+# def user_integrity_check(data, ip, game_metadata=None):
+#     src = "user"
+#     source_status = src
+#     new_files = []
+
+#     for file in data["files"]:
+#         new_file = {
+#             "name": file["name"],
+#             "size": file["size"],
+#             "size-r": file["size-r"] if "size-r" in file else 0,
+#             "size-rd": file["size-rd"] if "size-rd" in file else 0,
+#         }
+#         for checksum in file["checksums"]:
+#             checksum_type = checksum["type"]
+#             checksum_value = checksum["checksum"]
+#             new_file[checksum_type] = checksum_value
+
+#         new_files.append(new_file)
+
+#     data["rom"] = new_files
+#     key = calc_key(data)
+#     try:
+#         conn = db_connect()
+#     except Exception as e:
+#         print(f"Failed to connect to database: {e}")
+#         return
+
+#     conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}")
+
+
+#     try:
+#         with conn.cursor() as cursor:
+#             cursor.execute("SELECT MAX(`transaction`) FROM transactions")
+#             transaction_id = cursor.fetchone()["MAX(`transaction`)"] + 1
+
+#             category_text = f"Uploaded from {src}"
+#             log_text = f"Started loading file, State {source_status}. Transaction: {transaction_id}"
+
+#             user = f"cli:{getpass.getuser()}"
+
+#             create_log(
+#                 escape_string(category_text), user, escape_string(log_text), conn
+#             )
+
+#             extra_map = defaultdict(list)
+#             missing_map = defaultdict(list)
+#             extra_set = set()
+#             missing_set = set()
+
+#             filesets_check_for_full = set()
+#             for rom in data["rom"]:
+#                 pre_update_files(rom, filesets_check_for_full, transaction_id, conn)
+
+#             update_status_for_partial_filesets(list(filesets_check_for_full), conn)
+
+#             candidate_filesets = scan_filter_candidate_filesets()
+
+#             if len(candidate_filesets) == 0:
+#                 insert_new_fileset(
+#                     data, conn, None, src, key, None, transaction_id, log_text, user, ip
+#                 )
+#             # return matched_map, missing_map, extra_map
+
+
+#     except Exception as e:
+#         conn.rollback()
+#         print(f"Error processing user data: {e}")
+#     finally:
+#         category_text = f"Uploaded from {src}"
+#         log_text = f"Completed loading file, State {source_status}. Transaction: {transaction_id}"
+#         create_log(escape_string(category_text), user, escape_string(log_text), conn)
+#         conn.close()
+#     # return matched_map, missing_map, extra_map
+
+
+def update_status_for_partial_filesets(fileset_list, conn):
+    """
+    Updates the status of the given filesets from partial to full, if all of their files have full checksums.
+    """
+
+
 def user_integrity_check(data, ip, game_metadata=None):
     src = "user"
     source_status = src
```
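`update_status_for_partial_filesets` is added here as a docstring-only stub. Going only by that docstring, an implementation could look like the sketch below. This is hypothetical: the query shape, the 'partial'/'full' status values, and treating `checksize = '0'` as a full-file checksum are assumptions drawn from the docstring and the schema used elsewhere in this file.

```python
# Hypothetical implementation of the stub above, based on its docstring.
def update_status_for_partial_filesets(fileset_list, conn):
    """
    Updates the status of the given filesets from partial to full,
    if all of their files have full checksums.
    """
    with conn.cursor() as cursor:
        for fileset_id in fileset_list:
            # Count files in the fileset that lack a full-file checksum
            # (checksize '0' covers the whole file).
            cursor.execute(
                """
                SELECT COUNT(*) AS missing
                FROM file f
                WHERE f.fileset = %s
                AND NOT EXISTS (
                    SELECT 1 FROM filechecksum fc
                    WHERE fc.file = f.id AND fc.checksize = '0'
                )
                """,
                (fileset_id,),
            )
            if cursor.fetchone()["missing"] == 0:
                cursor.execute(
                    "UPDATE fileset SET status = 'full' "
                    "WHERE id = %s AND status = 'partial'",
                    (fileset_id,),
                )
```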
```diff
@@ -2835,8 +2955,8 @@ def user_integrity_check(data, ip, game_metadata=None):
         new_file = {
             "name": file["name"],
             "size": file["size"],
-            "size-r": file["size-r"],
-            "size-rd": file["size-rd"],
+            "size-r": file["size-r"] if "size-r" in file else 0,
+            "size-rd": file["size-rd"] if "size-rd" in file else 0,
         }
         for checksum in file["checksums"]:
             checksum_type = checksum["type"]
```
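The guarded lookups make `user_integrity_check` tolerant of uploads that omit `size-r`/`size-rd`, matching the commented-out variant earlier in the diff. Assuming `file` is a plain dict, `dict.get` expresses the same defaulting more compactly (an equivalent idiom, not what the commit uses):

```python
# Equivalent defaulting via dict.get (illustrative alternative).
new_file = {
    "name": file["name"],
    "size": file["size"],
    "size-r": file.get("size-r", 0),
    "size-rd": file.get("size-rd", 0),
}
```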
