From 161abd5cb33b739fa40618768a9349634a03ad58 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Fri, 4 Jul 2025 19:52:16 +0530 Subject: [PATCH 01/30] INTEGRITY: Increase character limit size for log text --- schema.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/schema.py b/schema.py index 09cbe7f..2ebfeeb 100644 --- a/schema.py +++ b/schema.py @@ -202,6 +202,10 @@ cursor.execute( "ALTER TABLE file MODIFY COLUMN `size-r` BIGINT DEFAULT 0, MODIFY COLUMN `size-rd` BIGINT DEFAULT 0;" ) +try: + cursor.execute("ALTER TABLE log ADD COLUMN `text` varchar(1000);") +except Exception: + cursor.execute("ALTER TABLE log MODIFY COLUMN `text` varchar(1000);") for index, definition in indices.items(): From 94cd1d98a34c2a2c22b45e0ae9076dcbf7d54e44 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Fri, 4 Jul 2025 19:54:58 +0530 Subject: [PATCH 02/30] INTEGRITY: Separate the additional checksum add logic from insert_filechecksum. Add new equal checksums for set.dat's fileset match --- db_functions.py | 55 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/db_functions.py b/db_functions.py index 45adc5d..de74d58 100644 --- a/db_functions.py +++ b/db_functions.py @@ -233,26 +233,34 @@ def insert_file(file, detection, src, conn): cursor.execute("SET @file_last = LAST_INSERT_ID()") -def insert_filechecksum(file, checktype, conn): +def insert_filechecksum(file, checktype, file_id, conn): if checktype not in file: return checksum = file[checktype] checksize, checktype, checksum = get_checksum_props(checktype, checksum) - query = f"INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (@file_last, '{checksize}', '{checktype}', '{checksum}')" + query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)" + with conn.cursor() as cursor: + cursor.execute(query, (file_id, checksize, checktype, checksum)) + + add_all_equal_checksums(checksize, checktype, checksum, file_id, conn) + + +def add_all_equal_checksums(checksize, checktype, checksum, file_id, conn): + """ + We can update all the checksums when file size is less than the checksum size type, as all checksums are equal in that case. 
+ """ with conn.cursor() as cursor: - cursor.execute(query) if "md5" not in checktype: return - size_name = "size" if checktype[-1] == "r": size_name += "-rd" if checktype[-1] == "s": size_name += "-d" - cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = @file_last") + cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = {file_id}") result = cursor.fetchone() if not result: return @@ -281,9 +289,10 @@ def insert_filechecksum(file, checktype, conn): checksum_size = exploded.pop() checksum_type = "-".join(exploded) - query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (@file_last, %s, %s, %s)" - with conn.cursor() as cursor: - cursor.execute(query, (checksum_size, checksum_type, checksum)) + query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)" + cursor.execute( + query, (file_id, checksum_size, checksum_type, checksum) + ) def delete_filesets(conn): @@ -558,9 +567,13 @@ def db_insert(data_arr, username=None, skiplog=False): for file in unique_files: insert_file(file, detection, src, conn) + file_id = None + with conn.cursor() as cursor: + cursor.execute("SELECT @file_last AS file_id") + file_id = cursor.fetchone()["file_id"] for key, value in file.items(): if key not in ["name", "size", "size-r", "size-rd", "sha1", "crc"]: - insert_filechecksum(file, key, conn) + insert_filechecksum(file, key, file_id, conn) if detection: conn.cursor().execute( @@ -1070,7 +1083,6 @@ def set_perform_match( with conn.cursor() as cursor: if len(candidate_filesets) == 1: matched_fileset_id = candidate_filesets[0] - cursor.execute( "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,) ) @@ -1123,6 +1135,7 @@ def set_perform_match( elif len(candidate_filesets) > 1: found_match = False + for candidate_fileset in candidate_filesets: (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn) if is_match: @@ -1579,7 +1592,7 @@ def populate_file(fileset, fileset_id, conn, detection): for key, value in file.items(): if key not in ["name", "size", "size-r", "size-rd", "sha1", "crc"]: - insert_filechecksum(file, key, conn) + insert_filechecksum(file, key, file_id, conn) if value in target_files_dict and not file_exists: cursor.execute( f"SELECT detection_type FROM file WHERE id = {target_files_dict[value]['id']}" @@ -1683,7 +1696,10 @@ def set_populate_file(fileset, fileset_id, conn, detection): cursor.execute("SET @file_last = LAST_INSERT_ID()") cursor.execute("SELECT @file_last AS file_id") - insert_filechecksum(file, "md5", conn) + cursor.execute("SELECT @file_last AS file_id") + file_id = cursor.fetchone()["file_id"] + + insert_filechecksum(file, "md5", file_id, conn) else: query = """ @@ -1701,6 +1717,7 @@ def set_populate_file(fileset, fileset_id, conn, detection): candidate_files[filename.lower()][0], ), ) + query = """ INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s) @@ -1714,6 +1731,14 @@ def set_populate_file(fileset, fileset_id, conn, detection): checksum, ), ) + + add_all_equal_checksums( + checksize, + checktype, + checksum, + candidate_files[filename.lower()][0], + conn, + ) seen_detection_files.add((filename.lower(), file["size"])) @@ -1745,9 +1770,13 @@ def insert_new_fileset( if fileset_id: for file in fileset["rom"]: insert_file(file, detection, src, conn) + file_id = None + with conn.cursor() as cursor: + cursor.execute("SELECT @file_last AS file_id") + file_id = cursor.fetchone()["file_id"] for key, value in file.items(): if key not in ["name", "size", 
"size-r", "size-rd", "sha1", "crc"]: - insert_filechecksum(file, key, conn) + insert_filechecksum(file, key, file_id, conn) return (fileset_id, existing) From cc82c6f065cbeea1e7867b203dd4e3b0f8c3527a Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Fri, 4 Jul 2025 23:37:48 +0530 Subject: [PATCH 03/30] INTEGRITY: Add filtering by platform for set.dat to reduce manual merge. --- db_functions.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/db_functions.py b/db_functions.py index de74d58..f405f41 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1028,6 +1028,12 @@ def set_process( for fileset_id, candidate_filesets in set_to_candidate_dict.items(): fileset = id_to_fileset_dict[fileset_id] + + # Filter by platform to reduce manual merge + candidate_filesets = set_filter_by_platform( + fileset["name"], candidate_filesets, conn + ) + ( fully_matched_filesets, auto_merged_filesets, @@ -1063,6 +1069,47 @@ def set_process( create_log(escape_string(category_text), user, escape_string(log_text), conn) +def set_filter_by_platform(gameid, candidate_filesets, conn): + """ + Return - list(number) : list of fileset ids of filtered candidates. + The number of manual merges in case the file size is not present (equal to -1) are too high. So we try to filter by platform extracted from the gameId of the set.dat fileset. We may disable this feature later or keep it optional with a command line argument. + """ + with conn.cursor() as cursor: + # e.g. sq2-coco3-1 + possible_platform_names = gameid.split("-")[1:] + + # Align platform names in set.dat and detection entries + for i, platform in enumerate(possible_platform_names): + if platform == "win": + possible_platform_names[i] = "windows" + elif platform == "mac": + possible_platform_names[i] = "macintosh" + + filtered_candidate_fileset = [] + + for candidate_fileset_id in candidate_filesets: + query = """ + SELECT g.platform + FROM fileset fs + JOIN game g ON g.id = fs.game + WHERE fs.id = %s + """ + cursor.execute(query, (candidate_fileset_id,)) + candidate_platform = cursor.fetchone()["platform"] + if candidate_platform in possible_platform_names: + filtered_candidate_fileset.append(candidate_fileset_id) + + if len(filtered_candidate_fileset) != 0: + print(len(candidate_filesets), " ", len(filtered_candidate_fileset), "\n") + + # If nothing was filtred, then it is likely, that platform information was not present, so we fallback to original list of candidates. + return ( + candidate_filesets + if len(filtered_candidate_fileset) == 0 + else filtered_candidate_fileset + ) + + def set_perform_match( fileset, src, From 599d2f1219a46e18265e2358b4fd2ef7a588902b Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Mon, 7 Jul 2025 18:45:18 +0530 Subject: [PATCH 04/30] INTEGRITY: Avoid adding a fileset as candidate if it was marked as partial in the same run. 
--- db_functions.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/db_functions.py b/db_functions.py index f405f41..3b18fd6 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1034,6 +1034,16 @@ def set_process( fileset["name"], candidate_filesets, conn ) + for candidate_fileset in candidate_filesets: + with conn.cursor() as cursor: + cursor.execute( + "SELECT id FROM fileset WHERE status = 'current' AND id = %s", + (candidate_fileset), + ) + result = cursor.fetchone() + if result: + candidate_filesets.remove(candidate_fileset) + ( fully_matched_filesets, auto_merged_filesets, @@ -1056,6 +1066,8 @@ def set_process( # Final log with conn.cursor() as cursor: + cursor.execute("UPDATE fileset SET status = 'partial' WHERE status = 'current'") + cursor.execute( "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", (transaction_id,), @@ -1099,9 +1111,6 @@ def set_filter_by_platform(gameid, candidate_filesets, conn): if candidate_platform in possible_platform_names: filtered_candidate_fileset.append(candidate_fileset_id) - if len(filtered_candidate_fileset) != 0: - print(len(candidate_filesets), " ", len(filtered_candidate_fileset), "\n") - # If nothing was filtred, then it is likely, that platform information was not present, so we fallback to original list of candidates. return ( candidate_filesets @@ -1135,7 +1144,7 @@ def set_perform_match( ) status = cursor.fetchone()["status"] if status == "detection": - update_fileset_status(cursor, matched_fileset_id, "partial") + update_fileset_status(cursor, matched_fileset_id, "current") set_populate_file(fileset, matched_fileset_id, conn, detection) auto_merged_filesets += 1 if not skiplog: @@ -1186,7 +1195,7 @@ def set_perform_match( for candidate_fileset in candidate_filesets: (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn) if is_match: - update_fileset_status(cursor, candidate_fileset, "partial") + update_fileset_status(cursor, candidate_fileset, "current") set_populate_file(fileset, candidate_fileset, conn, detection) auto_merged_filesets += 1 if not skiplog: From 6ea2bbae514a151cb0cec1f78e7525afdd87bcbe Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Mon, 7 Jul 2025 22:45:20 +0530 Subject: [PATCH 05/30] INTEGRITY: Add additional filtering logic for glk engines --- db_functions.py | 127 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 115 insertions(+), 12 deletions(-) diff --git a/db_functions.py b/db_functions.py index 3b18fd6..8800a8b 100644 --- a/db_functions.py +++ b/db_functions.py @@ -965,9 +965,17 @@ def set_process( if existing: continue - candidate_filesets = set_filter_candidate_filesets( - fileset_id, fileset, transaction_id, conn - ) + # Separating out the matching logic for glk engine + engine_name = fileset["sourcefile"].split("-")[0] + + if engine_name == "glk": + candidate_filesets = set_glk_filter_candidate_filesets( + fileset_id, fileset, transaction_id, engine_name, conn + ) + else: + candidate_filesets = set_filter_candidate_filesets( + fileset_id, fileset, transaction_id, conn + ) # Mac files in set.dat are not represented properly and they won't find a candidate fileset for a match, so we can drop them. 
if len(candidate_filesets) == 0: @@ -1288,6 +1296,84 @@ def is_full_checksum_match(candidate_fileset, fileset, conn): return (len(unmatched_files) == 0, unmatched_files) +def set_glk_filter_candidate_filesets( + fileset_id, fileset, transaction_id, engine_name, conn +): + """ + Returns a list of candidate filesets for glk engines that can be merged + """ + with conn.cursor() as cursor: + # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches + + query = """ + WITH candidate_fileset AS ( + SELECT fs.id AS fileset_id, f.size + FROM file f + JOIN fileset fs ON f.fileset = fs.id + JOIN game g ON g.id = fs.game + JOIN engine e ON e.id = g.engine + JOIN transactions t ON t.fileset = fs.id + WHERE fs.id != %s + AND e.engineid = %s + AND f.detection = 1 + AND t.transaction != %s + AND (g.gameid = %s OR (g.gameid != %s AND g.gameid LIKE %s)) + ), + total_detection_files AS ( + SELECT cf.fileset_id, COUNT(*) AS detection_files_found + FROM candidate_fileset cf + GROUP BY fileset_id + ), + set_fileset AS ( + SELECT size FROM file + WHERE fileset = %s + ), + matched_detection_files AS ( + SELECT cf.fileset_id, COUNT(*) AS match_files_count + FROM candidate_fileset cf + JOIN set_fileset sf ON + cf.size = sf.size OR cf.size = 0 + GROUP BY cf.fileset_id + ), + valid_matched_detection_files AS ( + SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count + FROM matched_detection_files mdf + JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id + WHERE tdf.detection_files_found <= mdf.match_files_count + ), + max_match_count AS ( + SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files + ) + SELECT vmdf.fileset_id + FROM valid_matched_detection_files vmdf + JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id + JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count + """ + + gameid_pattern = f"%{fileset['name']}%" + + cursor.execute( + query, + ( + fileset_id, + engine_name, + transaction_id, + fileset["name"], + fileset["name"], + gameid_pattern, + fileset_id, + ), + ) + rows = cursor.fetchall() + + candidates = [] + if rows: + for row in rows: + candidates.append(row["fileset_id"]) + + return candidates + + def set_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): """ Returns a list of candidate filesets that can be merged @@ -1715,6 +1801,13 @@ def set_populate_file(fileset, fileset_id, conn, detection): for target_file in target_files } + # For glk engines + candidate_file_size = { + target_file["size"]: target_file["id"] for target_file in target_files + } + + engine_name = fileset["sourcefile"].split("-")[0] + seen_detection_files = set() for file in fileset["rom"]: @@ -1724,13 +1817,16 @@ def set_populate_file(fileset, fileset_id, conn, detection): filename = os.path.basename(normalised_path(file["name"])) - if ((filename.lower(), file["size"]) in seen_detection_files) or ( - filename.lower() not in candidate_files + if (engine_name == "glk" and file["size"] not in candidate_file_size) and ( + (filename.lower(), file["size"]) in seen_detection_files or ( - filename.lower() in candidate_files - and ( - candidate_files[filename.lower()][1] != -1 - and candidate_files[filename.lower()][1] != file["size"] + filename.lower() not in candidate_files + or ( + filename.lower() in candidate_files + and ( + candidate_files[filename.lower()][1] != -1 + and 
candidate_files[filename.lower()][1] != file["size"] + ) ) ) ): @@ -1764,13 +1860,16 @@ def set_populate_file(fileset, fileset_id, conn, detection): name = %s WHERE id = %s """ + # Filtering was by filename, but we are still updating the file with the original filepath. cursor.execute( query, ( file["size"], normalised_path(file["name"]), - candidate_files[filename.lower()][0], + candidate_files[filename.lower()][0] + if engine_name != "glk" + else candidate_file_size[file["size"]], ), ) @@ -1781,7 +1880,9 @@ def set_populate_file(fileset, fileset_id, conn, detection): cursor.execute( query, ( - candidate_files[filename.lower()][0], + candidate_files[filename.lower()][0] + if engine_name != "glk" + else candidate_file_size[file["size"]], checksize, checktype, checksum, @@ -1792,7 +1893,9 @@ def set_populate_file(fileset, fileset_id, conn, detection): checksize, checktype, checksum, - candidate_files[filename.lower()][0], + candidate_files[filename.lower()][0] + if engine_name != "glk" + else candidate_file_size[file["size"]], conn, ) seen_detection_files.add((filename.lower(), file["size"])) From 3028d19c84d822efc2b30cd220fc2472d0f7a36e Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 8 Jul 2025 02:35:31 +0530 Subject: [PATCH 06/30] INTEGRITY: Add timestamp field in scan.dat and filtering support via modification time --- compute_hash.py | 56 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/compute_hash.py b/compute_hash.py index 0067cfd..b5c743a 100644 --- a/compute_hash.py +++ b/compute_hash.py @@ -4,6 +4,8 @@ import struct import sys from enum import Enum +from datetime import datetime, date +from collections import defaultdict class FileType(Enum): NON_MAC = "non_mac" @@ -154,7 +156,6 @@ def is_actual_resource_fork_mac(filepath): """ Returns boolean, checking the actual mac fork if it exists. 
""" resource_fork_path = os.path.join(filepath, "..namedfork", "rsrc") - print(resource_fork_path) return os.path.exists(resource_fork_path) def is_appledouble(file_byte_stream): @@ -505,7 +506,7 @@ def file_filter(files): for file in to_be_deleted: del files[file] -def compute_hash_of_dirs(root_directory, depth, size=0, alg="md5"): +def compute_hash_of_dirs(root_directory, depth, size=0, limit_timestamps_date=None, alg="md5"): """ Return dictionary containing checksums of all files in directory """ res = [] @@ -518,10 +519,14 @@ def compute_hash_of_dirs(root_directory, depth, size=0, alg="md5"): for root, _, contents in os.walk(directory): files.extend([os.path.join(root, f) for f in contents]) + # Filter out the files based on user input date - limit_timestamps_date + filtered_file_map = filter_files_by_timestamp(files, limit_timestamp_date) + # Produce filetype and filename(name to be used in game entry) for each file - for filepath in files: + for filepath in filtered_file_map: file_collection[filepath] = file_classification(filepath) + # Remove extra entries of macfiles to avoid extra checksum calculation in form of non mac files # Checksum for both the forks are calculated using a single file, so other files should be removed from the collection file_filter(file_collection) @@ -538,11 +543,45 @@ def compute_hash_of_dirs(root_directory, depth, size=0, alg="md5"): relative_dir = os.path.dirname(os.path.dirname(relative_path)) relative_path = os.path.join(relative_dir, base_name) - hash_of_dir[relative_path] = file_checksum(file_path, alg, size, file_info) + hash_of_dir[relative_path] = file_checksum(file_path, alg, size, file_info) + (filtered_file_map[file_path],) res.append(hash_of_dir) return res + +def validate_date(date_str): + """ + Confirms if the user provided timestamp is in a valid format. + Returns the date as a datetime object. + """ + formats = ["%Y-%m-%d", "%Y-%m", "%Y"] + for fmt in formats: + try: + return datetime.strptime(date_str, fmt).date() + except ValueError: + continue + raise ValueError("Invalid date format. Use YYYY, YYYY-MM, or YYYY-MM-DD") + + +def filter_files_by_timestamp(files, limit_timestamps_date): + """ + Removes the files those were modified after a certain timestamp provided by the user. + The files those were modified today are kept. 
+ Returns filtered map with filepath and its modification time + """ + + filtered_file_map = defaultdict(str) + user_date = validate_date(limit_timestamps_date) + today = date.today() + + for filepath in files: + mtime = datetime.fromtimestamp(os.path.getmtime(filepath)).date() + if limit_timestamps_date is None or (limit_timestamps_date is not None and (mtime <= user_date or mtime == today)): + filtered_file_map[filepath] = str(mtime) + + return filtered_file_map + + def create_dat_file(hash_of_dirs, path, checksum_size=0): with open(f"{os.path.basename(path)}.dat", "w") as file: # Header @@ -556,8 +595,8 @@ def create_dat_file(hash_of_dirs, path, checksum_size=0): # Game files for hash_of_dir in hash_of_dirs: file.write("game (\n") - for filename, (hashes, filesize) in hash_of_dir.items(): - data = f"name \"{filename}\" size {filesize}" + for filename, (hashes, filesize, timestamp) in hash_of_dir.items(): + data = f"name \"{filename}\" size {filesize} timestamp {timestamp}" for key, value in hashes: data += f" {key} {value}" @@ -579,10 +618,13 @@ def error(self, message): help="Depth from root to game directories") parser.add_argument("--size", help="Use first n bytes of file to calculate checksum") +parser.add_argument("--limit-timestamps", + help="Format - YYYY-MM-DD or YYYY-MM or YYYY. Filters out the files those were modified after the given timestamp. Note that if the modification time is today, it would not be filtered out.") args = parser.parse_args() path = os.path.abspath(args.directory) if args.directory else os.getcwd() depth = int(args.depth) if args.depth else 0 checksum_size = int(args.size) if args.size else 0 +limit_timestamp_date = str(args.limit_timestamps) if args.limit_timestamps else None create_dat_file(compute_hash_of_dirs( - path, depth, checksum_size), path, checksum_size) + path, depth, checksum_size, limit_timestamp_date), path, checksum_size) From 19b19c0224d6d332a991bbd66a3d63d3982779e9 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 8 Jul 2025 03:17:42 +0530 Subject: [PATCH 07/30] INTEGRITY: Add all size variants to scan.dat - size, size-r and size-rd. 
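Every file entry in the generated dat now carries the data fork size (size), the resource fork size (size-r) and the size of the resource fork's data section (size-rd), next to the timestamp field added in the previous commit. A resulting entry line looks roughly like the following (path, sizes, date and checksum keys are illustrative):

    name "DISK1/GAME.BIN" size 123456 size-r 2048 size-rd 1536 timestamp 2025-07-08 md5 <hash> md5-r <hash> md5-d <hash>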
--- compute_hash.py | 89 +++++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/compute_hash.py b/compute_hash.py index b5c743a..3b0e155 100644 --- a/compute_hash.py +++ b/compute_hash.py @@ -179,7 +179,7 @@ def is_appledouble(file_byte_stream): return True def macbin_get_resfork_data(file_byte_stream): - """ Returns the resource fork's data section as bytes of a macbinary file as well as its size """ + """ Returns the resource fork's data section as bytes, data fork size (size), resource fork size (size-r) and data section of resource fork size (size-rd) of a macbinary file """ if not file_byte_stream: return file_byte_stream @@ -189,10 +189,10 @@ def macbin_get_resfork_data(file_byte_stream): (rsrclen,) = struct.unpack(">I", file_byte_stream[0x57:0x5B]) resoure_fork_offset = 128 + datalen_padded - data_offset = int.from_bytes(file_byte_stream[resoure_fork_offset+0 : resoure_fork_offset+4]) - data_length = int.from_bytes(file_byte_stream[resoure_fork_offset+8 : resoure_fork_offset+12]) + rd_offset = int.from_bytes(file_byte_stream[resoure_fork_offset+0 : resoure_fork_offset+4]) + rd_length = int.from_bytes(file_byte_stream[resoure_fork_offset+8 : resoure_fork_offset+12]) - return (file_byte_stream[resoure_fork_offset + data_offset: resoure_fork_offset + data_offset + data_length], data_length) + return (file_byte_stream[resoure_fork_offset + rd_offset: resoure_fork_offset + rd_offset + rd_length], datalen, rsrclen, rd_length) def macbin_get_datafork(file_byte_stream): if not file_byte_stream: @@ -222,7 +222,7 @@ def is_appledouble(file_byte_stream): return True def appledouble_get_resfork_data(file_byte_stream): - """ Returns the resource fork's data section as bytes of an appledouble file as well as its size """ + """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of an appledouble file""" entry_count = read_be_16(file_byte_stream[24:]) for entry in range(entry_count): @@ -233,13 +233,13 @@ def appledouble_get_resfork_data(file_byte_stream): if id == 2: resource_fork_stream = file_byte_stream[offset:offset+length] - data_offset = int.from_bytes(resource_fork_stream[0:4]) - data_length = int.from_bytes(resource_fork_stream[8:12]) + rd_offset = int.from_bytes(resource_fork_stream[0:4]) + rd_length = int.from_bytes(resource_fork_stream[8:12]) - return (resource_fork_stream[data_offset: data_offset+data_length], data_length) + return (resource_fork_stream[rd_offset: rd_offset+rd_length], length, rd_length) def appledouble_get_datafork(filepath, fileinfo): - """ Returns data fork's content as bytes of appledouble file if found, otherwise empty byte string """ + """ Returns data fork's content as bytes and size of data fork of an appledouble file.""" try: index = filepath.index("__MACOSX") except ValueError: @@ -253,50 +253,54 @@ def appledouble_get_datafork(filepath, fileinfo): try: with open(data_fork_path, "rb") as f: - return f.read() + data = f.read() + return (data, len(data)) except (FileNotFoundError, IsADirectoryError): return b'' def raw_rsrc_get_datafork(filepath): - """ Returns the data fork's content as bytes corresponding to raw rsrc file. """ + """ Returns the data fork's content as bytes and size of the data fork corresponding to raw rsrc file. 
""" try: with open(filepath[:-5]+".data", "rb") as f: - return f.read() + data = f.read() + return (data, len(data)) except (FileNotFoundError, IsADirectoryError): return b'' def raw_rsrc_get_resource_fork_data(filepath): - """ Returns the resource fork's data section as bytes of a raw rsrc file as well as its size """ + """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of a raw rsrc file.""" with open(filepath, "rb") as f: resource_fork_stream = f.read() - data_offset = int.from_bytes(resource_fork_stream[0:4]) - data_length = int.from_bytes(resource_fork_stream[8:12]) + resource_fork_len = len(resource_fork_stream) + rd_offset = int.from_bytes(resource_fork_stream[0:4]) + rd_length = int.from_bytes(resource_fork_stream[8:12]) - return (resource_fork_stream[data_offset: data_offset+data_length], data_length) + return (resource_fork_stream[rd_offset: rd_offset+rd_length], resource_fork_len, rd_length) def actual_mac_fork_get_data_fork(filepath): - """ Returns the data fork's content as bytes if the actual mac fork exists """ + """ Returns the data fork's content as bytes and its size if the actual mac fork exists """ try: with open(filepath, "rb") as f: - return f.read() + data = f.read() + return (data, len(data)) except (FileNotFoundError, IsADirectoryError): return b'' def actual_mac_fork_get_resource_fork_data(filepath): - """ Returns the resource fork's data section as bytes of the actual mac fork as well as its size """ + """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of the actual mac fork.""" resource_fork_path = os.path.join(filepath, "..namedfork", "rsrc") with open(resource_fork_path, "rb") as f: resource_fork_stream = f.read() - data_offset = int.from_bytes(resource_fork_stream[0:4]) - data_length = int.from_bytes(resource_fork_stream[8:12]) + resource_fork_len = len(resource_fork_stream) + rd_offset = int.from_bytes(resource_fork_stream[0:4]) + rd_length = int.from_bytes(resource_fork_stream[8:12]) - return (resource_fork_stream[data_offset: data_offset+data_length], data_length) + return (resource_fork_stream[rd_offset: rd_offset+rd_length], resource_fork_len, rd_length) -def file_checksum(filepath, alg, size, file_info): - cur_file_size = 0 +def file_checksum(filepath, alg, custom_checksum_size, file_info): with open(filepath, "rb") as f: if file_info[0] == FileType.NON_MAC: - return (create_checksum_pairs(checksum(f, alg, size, filepath), alg, size), filesize(filepath)) + return (create_checksum_pairs(checksum(f, alg, custom_checksum_size, filepath), alg, custom_checksum_size), filesize(filepath), 0, 0) # Processing mac files res = [] @@ -304,29 +308,33 @@ def file_checksum(filepath, alg, size, file_info): datafork = b'' file_data = f.read() + size = 0 + size_r = 0 + size_rd = 0 + if file_info[0] == FileType.MAC_BINARY: - (resfork, cur_file_size) = macbin_get_resfork_data(file_data) + (resfork, size, size_r, size_rd) = macbin_get_resfork_data(file_data) datafork = macbin_get_datafork(file_data) elif file_info[0] in {FileType.APPLE_DOUBLE_DOT_, FileType.APPLE_DOUBLE_RSRC, FileType.APPLE_DOUBLE_MACOSX}: - (resfork, cur_file_size) = appledouble_get_resfork_data(file_data) - datafork = appledouble_get_datafork(filepath, file_info) + (resfork, size_r, size_rd) = appledouble_get_resfork_data(file_data) + (datafork, size) = appledouble_get_datafork(filepath, file_info) elif file_info[0] == 
FileType.RAW_RSRC: - (resfork, cur_file_size) = raw_rsrc_get_resource_fork_data(filepath) - datafork = raw_rsrc_get_datafork(filepath) + (resfork, size_r, size_rd) = raw_rsrc_get_resource_fork_data(filepath) + datafork, size = raw_rsrc_get_datafork(filepath) elif file_info[0] == FileType.ACTUAL_FORK_MAC: - (resfork, cur_file_size) = actual_mac_fork_get_resource_fork_data(filepath) - datafork = actual_mac_fork_get_data_fork(filepath) + (resfork, size_r, size_rd) = actual_mac_fork_get_resource_fork_data(filepath) + (datafork, size) = actual_mac_fork_get_data_fork(filepath) - hashes = checksum(resfork, alg, size, filepath) + hashes = checksum(resfork, alg, custom_checksum_size, filepath) prefix = 'r' if len(resfork): - res.extend(create_checksum_pairs(hashes, alg, size, prefix)) + res.extend(create_checksum_pairs(hashes, alg, custom_checksum_size, prefix)) - hashes = checksum(datafork, alg, size, filepath) + hashes = checksum(datafork, alg, custom_checksum_size, filepath) prefix = 'd' - res.extend(create_checksum_pairs(hashes, alg, size, prefix)) + res.extend(create_checksum_pairs(hashes, alg, custom_checksum_size, prefix)) - return (res, cur_file_size) + return (res, size, size_r, size_rd) def create_checksum_pairs(hashes, alg, size, prefix=None): res = [] @@ -571,7 +579,8 @@ def filter_files_by_timestamp(files, limit_timestamps_date): """ filtered_file_map = defaultdict(str) - user_date = validate_date(limit_timestamps_date) + if limit_timestamp_date is not None: + user_date = validate_date(limit_timestamps_date) today = date.today() for filepath in files: @@ -595,8 +604,8 @@ def create_dat_file(hash_of_dirs, path, checksum_size=0): # Game files for hash_of_dir in hash_of_dirs: file.write("game (\n") - for filename, (hashes, filesize, timestamp) in hash_of_dir.items(): - data = f"name \"{filename}\" size {filesize} timestamp {timestamp}" + for filename, (hashes, size, size_r, size_rd, timestamp) in hash_of_dir.items(): + data = f"name \"{filename}\" size {size} size-r {size_r} size-rd {size_rd} timestamp {timestamp}" for key, value in hashes: data += f" {key} {value}" From 074da92875fda1751230c50c1c1df953a1721850 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 8 Jul 2025 19:52:49 +0530 Subject: [PATCH 08/30] INTEGRITY: Fix clear database hang issue. Now the database is dropped and recreated. 
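Truncating the tables one by one from the /clear_database route could hang, so the route now drops the whole database over a root connection (no database selected) and rebuilds it through the new init_database() entry point in schema.py. The flow is roughly:

    from db_functions import db_connect_root
    from schema import init_database

    conn, db_name = db_connect_root()   # connects without selecting a database
    with conn.cursor() as cursor:
        cursor.execute(f"DROP DATABASE IF EXISTS {db_name}")
    conn.commit()
    init_database()                     # recreates tables, indices and extra columns
    conn.close()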
--- db_functions.py | 18 ++ fileset.py | 20 +- schema.py | 570 ++++++++++++++++++++++++------------------------ 3 files changed, 314 insertions(+), 294 deletions(-) diff --git a/db_functions.py b/db_functions.py index 8800a8b..393906b 100644 --- a/db_functions.py +++ b/db_functions.py @@ -32,6 +32,24 @@ def db_connect(): return conn +def db_connect_root(): + base_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join(base_dir, "mysql_config.json") + with open(config_path) as f: + mysql_cred = json.load(f) + + conn = pymysql.connect( + host=mysql_cred["servername"], + user=mysql_cred["username"], + password=mysql_cred["password"], + charset="utf8mb4", + cursorclass=pymysql.cursors.DictCursor, + autocommit=True, + ) + + return (conn, mysql_cred["dbname"]) + + def get_checksum_props(checkcode, checksum): checksize = 0 checktype = checkcode diff --git a/fileset.py b/fileset.py index 9b9dc93..3f7bc40 100644 --- a/fileset.py +++ b/fileset.py @@ -23,8 +23,10 @@ user_integrity_check, db_connect, create_log, + db_connect_root, ) from collections import defaultdict +from schema import init_database app = Flask(__name__) @@ -79,21 +81,13 @@ def index(): @app.route("/clear_database", methods=["POST"]) def clear_database(): try: - conn = db_connect() + (conn, db_name) = db_connect_root() with conn.cursor() as cursor: - cursor.execute("SET FOREIGN_KEY_CHECKS = 0;") - cursor.execute("TRUNCATE TABLE filechecksum") - cursor.execute("TRUNCATE TABLE history") - cursor.execute("TRUNCATE TABLE transactions") - cursor.execute("TRUNCATE TABLE queue") - cursor.execute("TRUNCATE TABLE file") - cursor.execute("TRUNCATE TABLE fileset") - cursor.execute("TRUNCATE TABLE game") - cursor.execute("TRUNCATE TABLE engine") - cursor.execute("TRUNCATE TABLE log") - cursor.execute("SET FOREIGN_KEY_CHECKS = 1;") + cursor.execute(f"DROP DATABASE IF EXISTS {db_name}") conn.commit() - print("DATABASE CLEARED") + print("DATABASE DROPPED") + init_database() + print("DATABASE INITIALISED") except Exception as e: print(f"Error clearing database: {e}") finally: diff --git a/schema.py b/schema.py index 2ebfeeb..826be7b 100644 --- a/schema.py +++ b/schema.py @@ -5,302 +5,310 @@ from datetime import datetime import os -# Load MySQL credentials -base_dir = os.path.dirname(os.path.abspath(__file__)) -config_path = os.path.join(base_dir, "mysql_config.json") -with open(config_path) as f: - mysql_cred = json.load(f) - -servername = mysql_cred["servername"] -username = mysql_cred["username"] -password = mysql_cred["password"] -dbname = mysql_cred["dbname"] - -# Create connection -conn = pymysql.connect( - host=servername, - user=username, - password=password, - charset="utf8mb4", - cursorclass=pymysql.cursors.DictCursor, - autocommit=False, -) - -# Check connection -if conn is None: - print("Error connecting to MySQL") - exit(1) - -cursor = conn.cursor() - -# Create database -sql = f"CREATE DATABASE IF NOT EXISTS {dbname}" -cursor.execute(sql) - -# Use database -cursor.execute(f"USE {dbname}") - -# Create tables -tables = { - "engine": """ - CREATE TABLE IF NOT EXISTS engine ( - id INT AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(200), - engineid VARCHAR(100) NOT NULL - ) - """, - "game": """ - CREATE TABLE IF NOT EXISTS game ( - id INT AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(200), - engine INT NOT NULL, - gameid VARCHAR(100) NOT NULL, - extra VARCHAR(200), - platform VARCHAR(30), - language VARCHAR(10), - FOREIGN KEY (engine) REFERENCES engine(id) - ) - """, - "fileset": """ - CREATE TABLE IF NOT EXISTS fileset ( - id 
INT AUTO_INCREMENT PRIMARY KEY, - game INT, - status VARCHAR(20), - src VARCHAR(20), - `key` VARCHAR(64), - `megakey` VARCHAR(64), - `delete` BOOLEAN DEFAULT FALSE NOT NULL, - `timestamp` TIMESTAMP NOT NULL, - detection_size INT, - FOREIGN KEY (game) REFERENCES game(id) - ) - """, - "file": """ - CREATE TABLE IF NOT EXISTS file ( - id INT AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(200) NOT NULL, - size BIGINT NOT NULL, - checksum VARCHAR(64) NOT NULL, - fileset INT NOT NULL, - detection BOOLEAN NOT NULL, - FOREIGN KEY (fileset) REFERENCES fileset(id) ON DELETE CASCADE - ) - """, - "filechecksum": """ - CREATE TABLE IF NOT EXISTS filechecksum ( - id INT AUTO_INCREMENT PRIMARY KEY, - file INT NOT NULL, - checksize VARCHAR(10) NOT NULL, - checktype VARCHAR(10) NOT NULL, - checksum VARCHAR(64) NOT NULL, - FOREIGN KEY (file) REFERENCES file(id) ON DELETE CASCADE - ) - """, - "queue": """ - CREATE TABLE IF NOT EXISTS queue ( - id INT AUTO_INCREMENT PRIMARY KEY, - time TIMESTAMP NOT NULL, - notes varchar(300), - fileset INT, - userid INT NOT NULL, - commit VARCHAR(64) NOT NULL, - FOREIGN KEY (fileset) REFERENCES fileset(id) - ) - """, - "log": """ - CREATE TABLE IF NOT EXISTS log ( - id INT AUTO_INCREMENT PRIMARY KEY, - `timestamp` TIMESTAMP NOT NULL, - category VARCHAR(100) NOT NULL, - user VARCHAR(100) NOT NULL, - `text` varchar(300) - ) - """, - "history": """ - CREATE TABLE IF NOT EXISTS history ( - id INT AUTO_INCREMENT PRIMARY KEY, - `timestamp` TIMESTAMP NOT NULL, - fileset INT NOT NULL, - oldfileset INT NOT NULL, - log INT - ) - """, - "transactions": """ - CREATE TABLE IF NOT EXISTS transactions ( - id INT AUTO_INCREMENT PRIMARY KEY, - `transaction` INT NOT NULL, - fileset INT NOT NULL - ) - """, - "possible_merges": """ - CREATE TABLE IF NOT EXISTS possible_merges ( - id INT AUTO_INCREMENT PRIMARY KEY, - child_fileset INT, - parent_fileset INT, - FOREIGN KEY (child_fileset) REFERENCES fileset(id) ON DELETE CASCADE, - FOREIGN KEY (parent_fileset) REFERENCES fileset(id) ON DELETE CASCADE - ) - """, -} -for table, definition in tables.items(): - try: - cursor.execute(definition) - print(f"Table '{table}' created successfully") - except pymysql.Error as err: - print(f"Error creating '{table}' table: {err}") - -# Create indices -indices = { - "detection": "CREATE INDEX detection ON file (detection)", - "checksum": "CREATE INDEX checksum ON filechecksum (checksum)", - "engineid": "CREATE INDEX engineid ON engine (engineid)", - "key": "CREATE INDEX fileset_key ON fileset (`key`)", - "status": "CREATE INDEX status ON fileset (status)", - "fileset": "CREATE INDEX fileset ON history (fileset)", - "file_name_size": "CREATE INDEX file_name_size ON file (name, size)", - "file_fileset_detection": "CREATE INDEX file_fileset_detection ON file (fileset, detection)", -} - -try: - cursor.execute("ALTER TABLE file ADD COLUMN detection_type VARCHAR(20);") -except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE file MODIFY COLUMN detection_type VARCHAR(20);") - -try: - cursor.execute("ALTER TABLE file ADD COLUMN `timestamp` TIMESTAMP NOT NULL;") -except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE file MODIFY COLUMN `timestamp` TIMESTAMP NOT NULL;") - -try: - cursor.execute("ALTER TABLE fileset ADD COLUMN `user_count` INT;") -except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE fileset MODIFY COLUMN `user_count` INT;") - -try: - cursor.execute("ALTER TABLE file ADD 
COLUMN punycode_name VARCHAR(200);") -except Exception: - cursor.execute("ALTER TABLE file MODIFY COLUMN punycode_name VARCHAR(200);") - -try: - cursor.execute( - "ALTER TABLE file ADD COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" - ) -except Exception: - cursor.execute( - "ALTER TABLE file MODIFY COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" +def init_database(): + # Load MySQL credentials + base_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join(base_dir, "mysql_config.json") + with open(config_path) as f: + mysql_cred = json.load(f) + + servername = mysql_cred["servername"] + username = mysql_cred["username"] + password = mysql_cred["password"] + dbname = mysql_cred["dbname"] + + # Create connection + conn = pymysql.connect( + host=servername, + user=username, + password=password, + charset="utf8mb4", + cursorclass=pymysql.cursors.DictCursor, + autocommit=False, ) -try: - cursor.execute( - "ALTER TABLE file ADD COLUMN `size-r` BIGINT DEFAULT 0, ADD COLUMN `size-rd` BIGINT DEFAULT 0;" - ) -except Exception: - cursor.execute( - "ALTER TABLE file MODIFY COLUMN `size-r` BIGINT DEFAULT 0, MODIFY COLUMN `size-rd` BIGINT DEFAULT 0;" - ) -try: - cursor.execute("ALTER TABLE log ADD COLUMN `text` varchar(1000);") -except Exception: - cursor.execute("ALTER TABLE log MODIFY COLUMN `text` varchar(1000);") + # Check connection + if conn is None: + print("Error connecting to MySQL") + exit(1) + + cursor = conn.cursor() + + # Create database + sql = f"CREATE DATABASE IF NOT EXISTS {dbname}" + cursor.execute(sql) + + # Use database + cursor.execute(f"USE {dbname}") + + # Create tables + tables = { + "engine": """ + CREATE TABLE IF NOT EXISTS engine ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(200), + engineid VARCHAR(100) NOT NULL + ) + """, + "game": """ + CREATE TABLE IF NOT EXISTS game ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(200), + engine INT NOT NULL, + gameid VARCHAR(100) NOT NULL, + extra VARCHAR(200), + platform VARCHAR(30), + language VARCHAR(10), + FOREIGN KEY (engine) REFERENCES engine(id) + ) + """, + "fileset": """ + CREATE TABLE IF NOT EXISTS fileset ( + id INT AUTO_INCREMENT PRIMARY KEY, + game INT, + status VARCHAR(20), + src VARCHAR(20), + `key` VARCHAR(64), + `megakey` VARCHAR(64), + `delete` BOOLEAN DEFAULT FALSE NOT NULL, + `timestamp` TIMESTAMP NOT NULL, + detection_size INT, + FOREIGN KEY (game) REFERENCES game(id) + ) + """, + "file": """ + CREATE TABLE IF NOT EXISTS file ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(200) NOT NULL, + size BIGINT NOT NULL, + checksum VARCHAR(64) NOT NULL, + fileset INT NOT NULL, + detection BOOLEAN NOT NULL, + FOREIGN KEY (fileset) REFERENCES fileset(id) ON DELETE CASCADE + ) + """, + "filechecksum": """ + CREATE TABLE IF NOT EXISTS filechecksum ( + id INT AUTO_INCREMENT PRIMARY KEY, + file INT NOT NULL, + checksize VARCHAR(10) NOT NULL, + checktype VARCHAR(10) NOT NULL, + checksum VARCHAR(64) NOT NULL, + FOREIGN KEY (file) REFERENCES file(id) ON DELETE CASCADE + ) + """, + "queue": """ + CREATE TABLE IF NOT EXISTS queue ( + id INT AUTO_INCREMENT PRIMARY KEY, + time TIMESTAMP NOT NULL, + notes varchar(300), + fileset INT, + userid INT NOT NULL, + commit VARCHAR(64) NOT NULL, + FOREIGN KEY (fileset) REFERENCES fileset(id) + ) + """, + "log": """ + CREATE TABLE IF NOT EXISTS log ( + id INT AUTO_INCREMENT PRIMARY KEY, + `timestamp` TIMESTAMP NOT NULL, + category VARCHAR(100) NOT NULL, + user VARCHAR(100) NOT NULL, + `text` varchar(300) + ) + """, + "history": """ + CREATE TABLE IF NOT 
EXISTS history ( + id INT AUTO_INCREMENT PRIMARY KEY, + `timestamp` TIMESTAMP NOT NULL, + fileset INT NOT NULL, + oldfileset INT NOT NULL, + log INT + ) + """, + "transactions": """ + CREATE TABLE IF NOT EXISTS transactions ( + id INT AUTO_INCREMENT PRIMARY KEY, + `transaction` INT NOT NULL, + fileset INT NOT NULL + ) + """, + "possible_merges": """ + CREATE TABLE IF NOT EXISTS possible_merges ( + id INT AUTO_INCREMENT PRIMARY KEY, + child_fileset INT, + parent_fileset INT, + FOREIGN KEY (child_fileset) REFERENCES fileset(id) ON DELETE CASCADE, + FOREIGN KEY (parent_fileset) REFERENCES fileset(id) ON DELETE CASCADE + ) + """, + } + + for table, definition in tables.items(): + try: + cursor.execute(definition) + print(f"Table '{table}' created successfully") + except pymysql.Error as err: + print(f"Error creating '{table}' table: {err}") + + # Create indices + indices = { + "detection": "CREATE INDEX detection ON file (detection)", + "checksum": "CREATE INDEX checksum ON filechecksum (checksum)", + "engineid": "CREATE INDEX engineid ON engine (engineid)", + "key": "CREATE INDEX fileset_key ON fileset (`key`)", + "status": "CREATE INDEX status ON fileset (status)", + "fileset": "CREATE INDEX fileset ON history (fileset)", + "file_name_size": "CREATE INDEX file_name_size ON file (name, size)", + "file_fileset_detection": "CREATE INDEX file_fileset_detection ON file (fileset, detection)", + } - -for index, definition in indices.items(): try: - cursor.execute(definition) - print(f"Created index for '{index}'") - except pymysql.Error as err: - print(f"Error creating index for '{index}': {err}") - - -# Insert random data into tables -def random_string(length=10): - return "".join(random.choices(string.ascii_letters + string.digits, k=length)) - - -def insert_random_data(): - for _ in range(1000): - # Insert data into engine - cursor.execute( - "INSERT INTO engine (name, engineid) VALUES (%s, %s)", - (random_string(), random_string()), - ) + cursor.execute("ALTER TABLE file ADD COLUMN detection_type VARCHAR(20);") + except Exception: + # if aleady exists, change the length of the column + cursor.execute("ALTER TABLE file MODIFY COLUMN detection_type VARCHAR(20);") - # Insert data into game - cursor.execute( - "INSERT INTO game (name, engine, gameid, extra, platform, language) VALUES (%s, %s, %s, %s, %s, %s)", - ( - random_string(), - 1, - random_string(), - random_string(), - random_string(), - random_string(), - ), - ) - - # Insert data into fileset - cursor.execute( - "INSERT INTO fileset (game, status, src, `key`, `megakey`, `timestamp`, detection_size) VALUES (%s, %s, %s, %s, %s, %s, %s)", - ( - 1, - "user", - random_string(), - random_string(), - random_string(), - datetime.now(), - random.randint(1, 100), - ), - ) + try: + cursor.execute("ALTER TABLE file ADD COLUMN `timestamp` TIMESTAMP NOT NULL;") + except Exception: + # if aleady exists, change the length of the column + cursor.execute("ALTER TABLE file MODIFY COLUMN `timestamp` TIMESTAMP NOT NULL;") - # Insert data into file - cursor.execute( - "INSERT INTO file (name, size, checksum, fileset, detection) VALUES (%s, %s, %s, %s, %s)", - (random_string(), random.randint(1000, 10000), random_string(), 1, True), - ) + try: + cursor.execute("ALTER TABLE fileset ADD COLUMN `user_count` INT;") + except Exception: + # if aleady exists, change the length of the column + cursor.execute("ALTER TABLE fileset MODIFY COLUMN `user_count` INT;") - # Insert data into filechecksum - cursor.execute( - "INSERT INTO filechecksum (file, checksize, checktype, 
checksum) VALUES (%s, %s, %s, %s)", - (1, random_string(), random_string(), random_string()), - ) + try: + cursor.execute("ALTER TABLE file ADD COLUMN punycode_name VARCHAR(200);") + except Exception: + cursor.execute("ALTER TABLE file MODIFY COLUMN punycode_name VARCHAR(200);") - # Insert data into queue + try: cursor.execute( - "INSERT INTO queue (time, notes, fileset, userid, commit) VALUES (%s, %s, %s, %s, %s)", - ( - datetime.now(), - random_string(), - 1, - random.randint(1, 100), - random_string(), - ), + "ALTER TABLE file ADD COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" ) - - # Insert data into log + except Exception: cursor.execute( - "INSERT INTO log (`timestamp`, category, user, `text`) VALUES (%s, %s, %s, %s)", - (datetime.now(), random_string(), random_string(), random_string()), + "ALTER TABLE file MODIFY COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" ) - # Insert data into history + try: cursor.execute( - "INSERT INTO history (`timestamp`, fileset, oldfileset, log) VALUES (%s, %s, %s, %s)", - (datetime.now(), 1, 2, 1), + "ALTER TABLE file ADD COLUMN `size-r` BIGINT DEFAULT 0, ADD COLUMN `size-rd` BIGINT DEFAULT 0;" ) - - # Insert data into transactions + except Exception: cursor.execute( - "INSERT INTO transactions (`transaction`, fileset) VALUES (%s, %s)", - (random.randint(1, 100), 1), + "ALTER TABLE file MODIFY COLUMN `size-r` BIGINT DEFAULT 0, MODIFY COLUMN `size-rd` BIGINT DEFAULT 0;" ) - - -# for testing locally -# insert_random_data() - -conn.commit() -conn.close() + try: + cursor.execute("ALTER TABLE log ADD COLUMN `text` varchar(5000);") + except Exception: + cursor.execute("ALTER TABLE log MODIFY COLUMN `text` varchar(5000);") + + for index, definition in indices.items(): + try: + cursor.execute(definition) + print(f"Created index for '{index}'") + except pymysql.Error as err: + print(f"Error creating index for '{index}': {err}") + + # Insert random data into tables + def random_string(length=10): + return "".join(random.choices(string.ascii_letters + string.digits, k=length)) + + def insert_random_data(): + for _ in range(1000): + # Insert data into engine + cursor.execute( + "INSERT INTO engine (name, engineid) VALUES (%s, %s)", + (random_string(), random_string()), + ) + + # Insert data into game + cursor.execute( + "INSERT INTO game (name, engine, gameid, extra, platform, language) VALUES (%s, %s, %s, %s, %s, %s)", + ( + random_string(), + 1, + random_string(), + random_string(), + random_string(), + random_string(), + ), + ) + + # Insert data into fileset + cursor.execute( + "INSERT INTO fileset (game, status, src, `key`, `megakey`, `timestamp`, detection_size) VALUES (%s, %s, %s, %s, %s, %s, %s)", + ( + 1, + "user", + random_string(), + random_string(), + random_string(), + datetime.now(), + random.randint(1, 100), + ), + ) + + # Insert data into file + cursor.execute( + "INSERT INTO file (name, size, checksum, fileset, detection) VALUES (%s, %s, %s, %s, %s)", + ( + random_string(), + random.randint(1000, 10000), + random_string(), + 1, + True, + ), + ) + + # Insert data into filechecksum + cursor.execute( + "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)", + (1, random_string(), random_string(), random_string()), + ) + + # Insert data into queue + cursor.execute( + "INSERT INTO queue (time, notes, fileset, userid, commit) VALUES (%s, %s, %s, %s, %s)", + ( + datetime.now(), + random_string(), + 1, + random.randint(1, 100), + random_string(), + ), + ) + + # Insert data into log + cursor.execute( + "INSERT INTO 
log (`timestamp`, category, user, `text`) VALUES (%s, %s, %s, %s)", + (datetime.now(), random_string(), random_string(), random_string()), + ) + + # Insert data into history + cursor.execute( + "INSERT INTO history (`timestamp`, fileset, oldfileset, log) VALUES (%s, %s, %s, %s)", + (datetime.now(), 1, 2, 1), + ) + + # Insert data into transactions + cursor.execute( + "INSERT INTO transactions (`transaction`, fileset) VALUES (%s, %s)", + (random.randint(1, 100), 1), + ) + + # for testing locally + # insert_random_data() + + conn.commit() + conn.close() + + +if __name__ == "__main__": + init_database() From 31b7d4ffc19fb2ea3cb9948d87060f62fba5dbc9 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 8 Jul 2025 23:39:53 +0530 Subject: [PATCH 09/30] INTEGRITY: Remove global database connection object from fileset.py, which is never closed. --- fileset.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/fileset.py b/fileset.py index 3f7bc40..dc91d33 100644 --- a/fileset.py +++ b/fileset.py @@ -32,21 +32,6 @@ secret_key = os.urandom(24) -base_dir = os.path.dirname(os.path.abspath(__file__)) -config_path = os.path.join(base_dir, "mysql_config.json") -with open(config_path) as f: - mysql_cred = json.load(f) - -conn = pymysql.connect( - host=mysql_cred["servername"], - user=mysql_cred["username"], - password=mysql_cred["password"], - db=mysql_cred["dbname"], - charset="utf8mb4", - cursorclass=pymysql.cursors.DictCursor, - autocommit=False, -) - @app.route("/") def index(): @@ -348,7 +333,7 @@ def fileset(): html += "Description\n" html += "Log Text\n" - related_filesets = get_all_related_filesets(id, conn) + related_filesets = get_all_related_filesets(id, connection) cursor.execute( f"SELECT * FROM history WHERE fileset IN ({','.join(map(str, related_filesets))}) OR oldfileset IN ({','.join(map(str, related_filesets))})" @@ -971,9 +956,12 @@ def validate(): del json_response["files"] json_response["status"] = "no_metadata" - fileset_id = user_insert_fileset(json_object, ip, conn) + conn = db_connect() + try: + fileset_id = user_insert_fileset(json_object, ip, conn) + finally: + conn.close() json_response["fileset"] = fileset_id - print(f"Response: {json_response}") return jsonify(json_response) matched_map = {} From e125227f3e94c705f81b965f2e6556da63b403eb Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 00:04:52 +0530 Subject: [PATCH 10/30] INTEGRITY: Filter manual merge candidates if size mismatch. 
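Manual merges are no longer logged directly from set_perform_match; unmatched set.dat filesets are collected in manual_merge_map and the 'Manual Merge Required' log entries are written once the whole dat has been processed. Additionally, whenever a candidate fileset gets auto-merged, remove_manual_merge_if_size_mismatch() re-checks every pending manual merge that still lists it: if any of its detection files has no name-and-size match in the corresponding set.dat fileset, the candidate is removed from manual_merge_map, from set_to_candidate_dict and from the possible_merges table.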
--- db_functions.py | 169 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 135 insertions(+), 34 deletions(-) diff --git a/db_functions.py b/db_functions.py index 393906b..77fa453 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1052,6 +1052,8 @@ def set_process( del set_to_candidate_dict[set_fileset] del id_to_fileset_dict[set_fileset] + manual_merge_map = defaultdict(list) + for fileset_id, candidate_filesets in set_to_candidate_dict.items(): fileset = id_to_fileset_dict[fileset_id] @@ -1060,16 +1062,6 @@ def set_process( fileset["name"], candidate_filesets, conn ) - for candidate_fileset in candidate_filesets: - with conn.cursor() as cursor: - cursor.execute( - "SELECT id FROM fileset WHERE status = 'current' AND id = %s", - (candidate_fileset), - ) - result = cursor.fetchone() - if result: - candidate_filesets.remove(candidate_fileset) - ( fully_matched_filesets, auto_merged_filesets, @@ -1086,14 +1078,31 @@ def set_process( auto_merged_filesets, manual_merged_filesets, mismatch_filesets, + manual_merge_map, + set_to_candidate_dict, conn, skiplog, ) + # print(manual_merge_map) + + for fileset_id, candidates in manual_merge_map.items(): + category_text = "Manual Merge Required" + log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidates)}." + manual_merged_filesets += 1 + # print(candidates) + add_manual_merge( + candidates, + fileset_id, + category_text, + log_text, + log_text, + user, + conn, + ) + # Final log with conn.cursor() as cursor: - cursor.execute("UPDATE fileset SET status = 'partial' WHERE status = 'current'") - cursor.execute( "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", (transaction_id,), @@ -1156,6 +1165,8 @@ def set_perform_match( auto_merged_filesets, manual_merged_filesets, mismatch_filesets, + manual_merge_map, + set_to_candidate_dict, conn, skiplog, ): @@ -1170,7 +1181,7 @@ def set_perform_match( ) status = cursor.fetchone()["status"] if status == "detection": - update_fileset_status(cursor, matched_fileset_id, "current") + update_fileset_status(cursor, matched_fileset_id, "parital") set_populate_file(fileset, matched_fileset_id, conn, detection) auto_merged_filesets += 1 if not skiplog: @@ -1183,6 +1194,9 @@ def set_perform_match( conn, ) delete_original_fileset(fileset_id, conn) + remove_manual_merge_if_size_mismatch( + matched_fileset_id, manual_merge_map, set_to_candidate_dict, conn + ) elif status == "partial" or status == "full": (is_match, unmatched_files) = is_full_checksum_match( matched_fileset_id, fileset, conn @@ -1221,7 +1235,7 @@ def set_perform_match( for candidate_fileset in candidate_filesets: (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn) if is_match: - update_fileset_status(cursor, candidate_fileset, "current") + update_fileset_status(cursor, candidate_fileset, "partial") set_populate_file(fileset, candidate_fileset, conn, detection) auto_merged_filesets += 1 if not skiplog: @@ -1234,22 +1248,14 @@ def set_perform_match( conn, ) delete_original_fileset(fileset_id, conn) + remove_manual_merge_if_size_mismatch( + candidate_fileset, manual_merge_map, set_to_candidate_dict, conn + ) found_match = True break if not found_match: - category_text = "Manual Merge Required" - log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidate_filesets)}." 
- manual_merged_filesets += 1 - add_manual_merge( - candidate_filesets, - fileset_id, - category_text, - log_text, - log_text, - user, - conn, - ) + manual_merge_map[fileset_id] = candidate_filesets return ( fully_matched_filesets, @@ -1259,6 +1265,98 @@ def set_perform_match( ) +def remove_manual_merge_if_size_mismatch( + child_fileset, manual_merge_map, set_to_candidate_dict, conn +): + with conn.cursor() as cursor: + query = """ + SELECT f.name, f.size + FROM fileset fs + JOIN file f ON f.fileset = fs.id + WHERE fs.id = %s + AND f.detection = 1 + """ + cursor.execute(query, (child_fileset,)) + files = cursor.fetchall() + + for parent_fileset, child_list in manual_merge_map.items(): + if child_fileset not in child_list: + continue + + for file in files: + if file["size"] == -1: + continue + + query = """ + SELECT f.id + FROM fileset fs + JOIN file f ON f.fileset = fs.id + WHERE fs.id = %s + AND f.name = %s + AND f.size = %s + """ + cursor.execute(query, (parent_fileset, file["name"], file["size"])) + result = cursor.fetchall() + + if not result: + remove_manual_merge( + child_fileset, + parent_fileset, + manual_merge_map, + set_to_candidate_dict, + conn, + ) + break + + for parent_fileset, child_list in set_to_candidate_dict.items(): + if child_fileset not in child_list: + continue + + for file in files: + if file["size"] == -1: + continue + + query = """ + SELECT f.id + FROM fileset fs + JOIN file f ON f.fileset = fs.id + WHERE fs.id = %s + AND f.name = %s + AND f.size = %s + """ + cursor.execute(query, (parent_fileset, file["name"], file["size"])) + result = cursor.fetchall() + + if not result: + remove_manual_merge( + child_fileset, + parent_fileset, + manual_merge_map, + set_to_candidate_dict, + conn, + ) + break + + +def remove_manual_merge( + child_fileset, parent_fileset, manual_merge_map, set_to_candidate_dict, conn +): + if parent_fileset in manual_merge_map: + if child_fileset in manual_merge_map[parent_fileset]: + manual_merge_map[parent_fileset].remove(child_fileset) + if parent_fileset in set_to_candidate_dict: + if child_fileset in set_to_candidate_dict[parent_fileset]: + set_to_candidate_dict[parent_fileset].remove(child_fileset) + + with conn.cursor() as cursor: + query = """ + DELETE FROM possible_merges + WHERE child_fileset = %s + AND parent_fileset = %s + """ + cursor.execute(query, (child_fileset, parent_fileset)) + + def add_manual_merge( child_filesets, parent_fileset, category_text, log_text, print_text, user, conn ): @@ -1835,15 +1933,18 @@ def set_populate_file(fileset, fileset_id, conn, detection): filename = os.path.basename(normalised_path(file["name"])) - if (engine_name == "glk" and file["size"] not in candidate_file_size) and ( - (filename.lower(), file["size"]) in seen_detection_files - or ( - filename.lower() not in candidate_files + if (engine_name == "glk" and file["size"] not in candidate_file_size) or ( + engine_name != "glk" + and ( + (filename.lower(), file["size"]) in seen_detection_files or ( - filename.lower() in candidate_files - and ( - candidate_files[filename.lower()][1] != -1 - and candidate_files[filename.lower()][1] != file["size"] + filename.lower() not in candidate_files + or ( + filename.lower() in candidate_files + and ( + candidate_files[filename.lower()][1] != -1 + and candidate_files[filename.lower()][1] != file["size"] + ) ) ) ) From 898ffd064332840409f9a1070b13a2e0e8a6ff20 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 16:18:34 +0530 Subject: [PATCH 11/30] INTEGRITY: Add metadata for set.dat --- 
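Every header field of a set.dat entry other than its rom list is serialized into the new fileset.set_dat_metadata column as "key = value , " pairs (e.g. "name = ... , description = ... , ", depending on which header fields the dat provides), and the fileset page now displays this column for filesets whose status is 'dat'.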
db_functions.py | 12 +++++++++++- fileset.py | 26 ++++++++++++++++++++------ schema.py | 9 +++++++++ 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/db_functions.py b/db_functions.py index 77fa453..2e022e3 100644 --- a/db_functions.py +++ b/db_functions.py @@ -107,6 +107,7 @@ def insert_fileset( transaction, log_text, conn, + set_dat_metadata="", ip="", username=None, skiplog=None, @@ -162,7 +163,7 @@ def insert_fileset( return (existing_entry, True) # $game and $key should not be parsed as a mysql string, hence no quotes - query = f"INSERT INTO fileset (game, status, src, `key`, megakey, `timestamp`) VALUES ({game}, '{status}', '{src}', {key}, {megakey}, FROM_UNIXTIME(@fileset_time_last))" + query = f"INSERT INTO fileset (game, status, src, `key`, megakey, `timestamp`, set_dat_metadata) VALUES ({game}, '{status}', '{src}', {key}, {megakey}, FROM_UNIXTIME(@fileset_time_last), '{escape_string(set_dat_metadata)}')" fileset_id = -1 with conn.cursor() as cursor: cursor.execute(query) @@ -968,6 +969,11 @@ def set_process( megakey = "" log_text = f"State {source_status}." + set_dat_metadata = "" + for meta in fileset: + if meta != "rom": + set_dat_metadata += meta + " = " + fileset[meta] + " , " + (fileset_id, existing) = insert_new_fileset( fileset, conn, @@ -978,8 +984,10 @@ def set_process( transaction_id, log_text, user, + set_dat_metadata=set_dat_metadata, skiplog=skiplog, ) + if existing: continue @@ -2030,6 +2038,7 @@ def insert_new_fileset( transaction_id, log_text, user, + set_dat_metadata="", ip="", skiplog=False, ): @@ -2042,6 +2051,7 @@ def insert_new_fileset( log_text, conn, username=user, + set_dat_metadata=set_dat_metadata, ip=ip, skiplog=skiplog, ) diff --git a/fileset.py b/fileset.py index dc91d33..85bc093 100644 --- a/fileset.py +++ b/fileset.py @@ -155,18 +155,32 @@ def fileset(): (id,), ) row = cursor.fetchone() - print(row) if row: id = row["fileset"] - cursor.execute(f"SELECT * FROM fileset WHERE id = {id}") + cursor.execute("SELECT status FROM fileset WHERE id = %s", (id,)) + status = cursor.fetchone()["status"] + + if status == "dat": + cursor.execute( + """SELECT id, game, status, src, `key`, megakey, `delete`, timestamp, set_dat_metadata FROM fileset WHERE id = %s""", + (id,), + ) + else: + cursor.execute( + """SELECT id, game, status, src, `key`, megakey, `delete`, timestamp, detection_size, user_count FROM fileset WHERE id = %s""", + (id,), + ) + result = cursor.fetchone() - print(result) html += "

Fileset details

" html += "\n" if result["game"]: - cursor.execute( - f"SELECT game.name as 'game name', engineid, gameid, extra, platform, language FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = {id}" - ) + if status == "dat": + query = """SELECT game.name as 'game name', engineid, gameid, extra, platform, language, fileset.set_dat_metadata FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = %s""" + else: + query = """SELECT game.name as 'game name', engineid, gameid, extra, platform, language FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = %s""" + print(query) + cursor.execute(query, (id,)) result = {**result, **cursor.fetchone()} else: # result.pop('key', None) diff --git a/schema.py b/schema.py index 826be7b..4b9ba42 100644 --- a/schema.py +++ b/schema.py @@ -209,6 +209,15 @@ def init_database(): except Exception: cursor.execute("ALTER TABLE log MODIFY COLUMN `text` varchar(5000);") + try: + cursor.execute( + "ALTER TABLE fileset ADD COLUMN set_dat_metadata varchar(5000) DEFAULT 'UTF-8';" + ) + except Exception: + cursor.execute( + "ALTER TABLE fileset MODIFY COLUMN set_dat_metadata varchar(5000) DEFAULT 'UTF-8';" + ) + for index, definition in indices.items(): try: cursor.execute(definition) From 8970cd6a9a04074ccb836f532b499613ccd8b447 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 18:30:28 +0530 Subject: [PATCH 12/30] INTEGRITY: Add navbar with logo. --- fileset.py | 35 ++++++++++++++++++ pagination.py | 5 +++ favicon-16x16.png => static/favicon-16x16.png | Bin favicon-32x32.png => static/favicon-32x32.png | Bin .../integrity_service_logo_256.png | Bin 5 files changed, 40 insertions(+) rename favicon-16x16.png => static/favicon-16x16.png (100%) rename favicon-32x32.png => static/favicon-32x32.png (100%) rename integrity_service_logo_256.png => static/integrity_service_logo_256.png (100%) diff --git a/fileset.py b/fileset.py index 85bc093..436652d 100644 --- a/fileset.py +++ b/fileset.py @@ -42,6 +42,11 @@ def index(): +

Fileset Database

Fileset Actions

    @@ -138,6 +143,11 @@ def fileset(): +

    Fileset: {id}

""" @@ -466,6 +476,11 @@ def match_fileset_route(id): +

Matched Filesets for Fileset: {id}

@@ -553,6 +568,11 @@ def merge_fileset(id): +

Search Results for '{search_query}'

@@ -587,6 +607,11 @@ def merge_fileset(id): +

Search Fileset to Merge

@@ -641,6 +666,11 @@ def possible_merge_filesets(id): +

Possible Merges for fileset-'{id}'

@@ -748,6 +778,11 @@ def highlight_differences(source, target): +

Confirm Merge

<tr><th>ID</th><th>Game Name</th><th>Platform</th><th>Language</th><th>Extra</th><th>Details</th><th>Action</th></tr>
diff --git a/pagination.py b/pagination.py index cb8ba3d..28b82f5 100644 --- a/pagination.py +++ b/pagination.py @@ -141,6 +141,11 @@ def create_page( +
<tr><th>Field</th><th>Source Fileset</th><th>Target Fileset</th></tr>
""" diff --git a/favicon-16x16.png b/static/favicon-16x16.png similarity index 100% rename from favicon-16x16.png rename to static/favicon-16x16.png diff --git a/favicon-32x32.png b/static/favicon-32x32.png similarity index 100% rename from favicon-32x32.png rename to static/favicon-32x32.png diff --git a/integrity_service_logo_256.png b/static/integrity_service_logo_256.png similarity index 100% rename from integrity_service_logo_256.png rename to static/integrity_service_logo_256.png From bd3f2f4ece9f06d3fa05da1d1153974c1456cae1 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 19:44:37 +0530 Subject: [PATCH 13/30] INTEGRITY: Add modification timestamps for macfiles --- compute_hash.py | 77 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/compute_hash.py b/compute_hash.py index 3b0e155..94bf18b 100644 --- a/compute_hash.py +++ b/compute_hash.py @@ -4,7 +4,7 @@ import struct import sys from enum import Enum -from datetime import datetime, date +from datetime import datetime, date, timedelta from collections import defaultdict class FileType(Enum): @@ -75,9 +75,10 @@ def get_dirs_at_depth(directory, depth): if depth == num_sep_this - num_sep: yield root -def read_be_32(byte_stream): +def read_be_32(byte_stream, signed=False): """ Return unsigned integer of size_in_bits, assuming the data is big-endian """ - (uint,) = struct.unpack(">I", byte_stream[:32//8]) + format = ">i" if signed else ">I" + (uint,) = struct.unpack(format, byte_stream[:32//8]) return uint def read_be_16(byte_stream): @@ -534,7 +535,6 @@ def compute_hash_of_dirs(root_directory, depth, size=0, limit_timestamps_date=No for filepath in filtered_file_map: file_collection[filepath] = file_classification(filepath) - # Remove extra entries of macfiles to avoid extra checksum calculation in form of non mac files # Checksum for both the forks are calculated using a single file, so other files should be removed from the collection file_filter(file_collection) @@ -557,6 +557,70 @@ def compute_hash_of_dirs(root_directory, depth, size=0, limit_timestamps_date=No return res +def extract_macbin_mtime(file_byte_stream): + """ + Returns modification time of macbinary file from the header. + Doc - +$5f / 4: modification date/time. + Doc - Timestamps are unsigned 32-bit values indicating the time in seconds since midnight on Jan 1, 1904, in local time. + """ + macbin_epoch = datetime(1904, 1, 1) + header = file_byte_stream[:128] + macbin_seconds = read_be_32(header[0x5f:]) + return (macbin_epoch + timedelta(seconds=macbin_seconds)).date() + + +def extract_mtime_appledouble(file_byte_stream): + """ + Returns modification time of appledouble file. + Doc 1 - The File Dates Info entry (ID=8) consists of the file creation, modification, backup + and access times (see Figure 2-1), stored as a signed number of seconds before + or after 12:00 a.m. 
(midnight), January 1, 2000 Greenwich Mean Time (GMT) + + Doc 2 - + struct ASFileDates /* entry ID 8, file dates info */ + { + sint32 create; /* file creation date/time */ + sint32 modify; /* last modification date/time */ + sint32 backup; /* last backup date/time */ + sint32 access; /* last access date/time */ + }; /* ASFileDates */ + """ + entry_count = read_be_16(file_byte_stream[24:]) + for entry in range(entry_count): + start_index = 26 + entry*12 + id = read_be_32(file_byte_stream[start_index:]) + offset = read_be_32(file_byte_stream[start_index+4:]) + length = read_be_32(file_byte_stream[start_index+8:]) + + if id == 8: + date_info_data = file_byte_stream[offset:offset + length] + if len(date_info_data) < 16: + raise ValueError("FileDatesInfo block is too short.") + appledouble_epoch = datetime(2000, 1, 1) + modify_seconds = read_be_32(date_info_data[4:8], signed=True) + return (appledouble_epoch + timedelta(seconds=modify_seconds)).date() + + return None + + +def macfile_timestamp(filepath): + """ + Returns the modification times for the mac file from their finderinfo. + If the file is not a macfile, it returns None + """ + with open(filepath, "rb") as f: + data = f.read() + # Macbinary + if is_macbin(filepath): + return extract_macbin_mtime(data) + + # Appledouble + if is_appledouble_rsrc(filepath) or is_appledouble_in_dot_(filepath) or is_appledouble_in_macosx(filepath): + return extract_mtime_appledouble(data) + + return None + + def validate_date(date_str): """ Confirms if the user provided timestamp is in a valid format. @@ -579,12 +643,15 @@ def filter_files_by_timestamp(files, limit_timestamps_date): """ filtered_file_map = defaultdict(str) + if limit_timestamp_date is not None: user_date = validate_date(limit_timestamps_date) today = date.today() for filepath in files: - mtime = datetime.fromtimestamp(os.path.getmtime(filepath)).date() + mtime = macfile_timestamp(filepath) + if mtime is None: + mtime = datetime.fromtimestamp(os.path.getmtime(filepath)).date() if limit_timestamps_date is None or (limit_timestamps_date is not None and (mtime <= user_date or mtime == today)): filtered_file_map[filepath] = str(mtime) From 8575f8e90b57c84f89b86fb5536b7b902276e50e Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 19:45:33 +0530 Subject: [PATCH 14/30] INTEGIRTY: Add punycode encoding for scan utlity. 
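
The helpers introduced in this patch decide when a filename has to be punyencoded before it is written to the DAT file: reserved symbols, control characters, non-ASCII characters, and names ending in a space or dot all force encoding. A minimal standalone sketch of that decision rule (illustrative only, not the script's implementation; the sample names are made up):

    SPECIAL_SYMBOLS = '/":*|\\?%<>\x7f'

    def needs_punyencoding(name: str) -> bool:
        # Names that already carry the punycode prefix are left untouched.
        if name.startswith("xn--"):
            return False
        # Control chars, non-ASCII chars and reserved symbols force encoding.
        if not all(0x20 <= ord(c) < 0x80 and c not in SPECIAL_SYMBOLS for c in name):
            return True
        # Filenames may not end in a space or a dot.
        return name[-1] in " ."

    for sample in ["launcher.dat", "Read Me ", 'bad"name', "xn--already-encoded"]:
        print(sample, "->", needs_punyencoding(sample))
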
--- compute_hash.py | 99 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 19 deletions(-) diff --git a/compute_hash.py b/compute_hash.py index 94bf18b..626e622 100644 --- a/compute_hash.py +++ b/compute_hash.py @@ -18,6 +18,8 @@ class FileType(Enum): script_version = "0.1" +SPECIAL_SYMBOLS = '/":*|\\?%<>\x7f' + # CRC table CRC16_XMODEM_TABLE = [ 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, @@ -75,6 +77,83 @@ def get_dirs_at_depth(directory, depth): if depth == num_sep_this - num_sep: yield root + +def my_escape_string(s: str) -> str: + """ + Escape strings + + Escape the following: + - escape char: \x81 + - unallowed filename chars: https://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words + - control chars < 0x20 + """ + new_name = "" + for char in s: + if char == "\x81": + new_name += "\x81\x79" + elif char in SPECIAL_SYMBOLS or ord(char) < 0x20: + new_name += "\x81" + chr(0x80 + ord(char)) + else: + new_name += char + return new_name + + +def encode_punycode(orig): + """ + Punyencode strings + + - escape special characters and + - ensure filenames can't end in a space or dotif temp == None: + """ + s = my_escape_string(orig) + encoded = s.encode("punycode").decode("ascii") + # punyencoding adds an '-' at the end when there are no special chars + # don't use it for comparing + compare = encoded + if encoded.endswith("-"): + compare = encoded[:-1] + if orig != compare or compare[-1] in " .": + return "xn--" + encoded + return orig + + +def punycode_need_encode(orig): + """ + A filename needs to be punyencoded when it: + + - contains a char that should be escaped or + - ends with a dot or a space. + """ + if len(orig) > 4 and orig[:4] == "xn--": + return False + if not all((0x20 <= ord(c) < 0x80) and c not in SPECIAL_SYMBOLS for c in orig): + return True + if orig[-1] in " .": + return True + return False + + +def split_path_recursive(path): + parts = [] + while True: + head, tail = os.path.split(path) + if tail: + parts.insert(0, tail) + path = head + else: + if head: + parts.insert(0, head) + break + return parts + +def encode_path_components(filepath): + """ + Puny encodes all separate components of filepath + """ + parts = split_path_recursive(filepath) + encoded_parts = [encode_punycode(p) if punycode_need_encode(p) else p for p in parts] + return os.path.join(*encoded_parts) + def read_be_32(byte_stream, signed=False): """ Return unsigned integer of size_in_bits, assuming the data is big-endian """ format = ">i" if signed else ">I" @@ -202,25 +281,6 @@ def macbin_get_datafork(file_byte_stream): (datalen,) = struct.unpack(">I", file_byte_stream[0x53:0x57]) return file_byte_stream[0x80: 0x80 + datalen] -def is_appledouble(file_byte_stream): - """ - Appledouble Structure - - - Header: - +$00 / 4: signature (0x00 0x05 0x16 0x00) - +$04 / 4: version (0x00 0x01 0x00 0x00 (v1) -or- 0x00 0x02 0x00 0x00 (v2)) - +$08 /16: home file system string (v1) -or- zeroes (v2) - +$18 / 2: number of entries - - Entries: - +$00 / 4: entry ID (1-15) - +$04 / 4: offset to data from start of file - +$08 / 4: length of entry in bytes; may be zero - """ - if (not file_byte_stream or read_be_32(file_byte_stream) != 0x00051607): - return False - - return True def appledouble_get_resfork_data(file_byte_stream): """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of an appledouble file""" @@ -672,6 +732,7 @@ def create_dat_file(hash_of_dirs, path, checksum_size=0): 
for hash_of_dir in hash_of_dirs: file.write("game (\n") for filename, (hashes, size, size_r, size_rd, timestamp) in hash_of_dir.items(): + filename = encode_path_components(filename) data = f"name \"{filename}\" size {size} size-r {size_r} size-rd {size_rd} timestamp {timestamp}" for key, value in hashes: data += f" {key} {value}" From df41d7afd55239e50f2e2c97fe2ccaa09aa2df53 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 20:23:56 +0530 Subject: [PATCH 15/30] INTEGRITY: Fix the navbar on top. --- fileset.py | 28 ++++++++++++++-------------- pagination.py | 4 ++-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fileset.py b/fileset.py index 436652d..c8e9cf7 100644 --- a/fileset.py +++ b/fileset.py @@ -42,12 +42,12 @@ def index(): -
""" html += f"" @@ -476,12 +476,12 @@ def match_fileset_route(id): -
@@ -568,12 +568,12 @@ def merge_fileset(id): -
Fileset ID
""" @@ -778,12 +778,12 @@ def highlight_differences(source, target): -
<tr><th>ID</th><th>Game Name</th><th>Platform</th><th>Language</th><th>Extra</th><th>Details</th><th>Action</th></tr>
""" diff --git a/pagination.py b/pagination.py index 28b82f5..091384c 100644 --- a/pagination.py +++ b/pagination.py @@ -141,13 +141,13 @@ def create_page( -
<tr><th>Field</th><th>Source Fileset</th><th>Target Fileset</th></tr>
+
""" if not results: return "No results for given filters" From c38881cff93d4cdc2e9675c7f1cf162737344ebc Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Mon, 14 Jul 2025 15:55:57 +0530 Subject: [PATCH 16/30] INTEGRITY: Limit match fileset to 1 in remove_manual_merge_if_size_mismatch --- db_functions.py | 111 ++++++++++++++++++------------------------------ 1 file changed, 42 insertions(+), 69 deletions(-) diff --git a/db_functions.py b/db_functions.py index 2e022e3..4c2d927 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1179,7 +1179,7 @@ def set_perform_match( skiplog, ): """ - TODO + "Performs matching for set.dat" """ with conn.cursor() as cursor: if len(candidate_filesets) == 1: @@ -1189,11 +1189,11 @@ def set_perform_match( ) status = cursor.fetchone()["status"] if status == "detection": - update_fileset_status(cursor, matched_fileset_id, "parital") + update_fileset_status(cursor, matched_fileset_id, "partial") set_populate_file(fileset, matched_fileset_id, conn, detection) auto_merged_filesets += 1 if not skiplog: - set_log_matched_fileset( + log_matched_fileset( src, fileset_id, matched_fileset_id, @@ -1247,7 +1247,7 @@ def set_perform_match( set_populate_file(fileset, candidate_fileset, conn, detection) auto_merged_filesets += 1 if not skiplog: - set_log_matched_fileset( + log_matched_fileset( src, fileset_id, candidate_fileset, @@ -1287,63 +1287,37 @@ def remove_manual_merge_if_size_mismatch( cursor.execute(query, (child_fileset,)) files = cursor.fetchall() - for parent_fileset, child_list in manual_merge_map.items(): - if child_fileset not in child_list: - continue - - for file in files: - if file["size"] == -1: - continue - - query = """ - SELECT f.id - FROM fileset fs - JOIN file f ON f.fileset = fs.id - WHERE fs.id = %s - AND f.name = %s - AND f.size = %s - """ - cursor.execute(query, (parent_fileset, file["name"], file["size"])) - result = cursor.fetchall() - - if not result: - remove_manual_merge( - child_fileset, - parent_fileset, - manual_merge_map, - set_to_candidate_dict, - conn, - ) - break - - for parent_fileset, child_list in set_to_candidate_dict.items(): - if child_fileset not in child_list: - continue - - for file in files: - if file["size"] == -1: + for possible_removals in [manual_merge_map, set_to_candidate_dict]: + for parent_fileset, child_list in possible_removals.items(): + if child_fileset not in child_list: continue - query = """ - SELECT f.id - FROM fileset fs - JOIN file f ON f.fileset = fs.id - WHERE fs.id = %s - AND f.name = %s - AND f.size = %s - """ - cursor.execute(query, (parent_fileset, file["name"], file["size"])) - result = cursor.fetchall() - - if not result: - remove_manual_merge( - child_fileset, - parent_fileset, - manual_merge_map, - set_to_candidate_dict, - conn, - ) - break + for file in files: + if file["size"] == -1: + continue + + query = """ + SELECT fs.id + FROM fileset fs + JOIN file f ON f.fileset = fs.id + WHERE fs.id = %s + AND REGEXP_REPLACE(f.name, '^.*[\\\\/]', '') = %s + AND f.size = %s + LIMIT 1 + """ + filename = os.path.basename(normalised_path(file["name"])) + cursor.execute(query, (parent_fileset, filename, file["size"])) + result = cursor.fetchall() + + if not result: + remove_manual_merge( + child_fileset, + parent_fileset, + manual_merge_map, + set_to_candidate_dict, + conn, + ) + break def remove_manual_merge( @@ -2063,21 +2037,20 @@ def insert_new_fileset( cursor.execute("SELECT @file_last AS file_id") file_id = cursor.fetchone()["file_id"] for key, value in file.items(): - if key not in ["name", "size", 
"size-r", "size-rd", "sha1", "crc"]: + if key not in [ + "name", + "size", + "size-r", + "size-rd", + "sha1", + "crc", + "modification-time", + ]: insert_filechecksum(file, key, file_id, conn) return (fileset_id, existing) def log_matched_fileset(src, fileset_last, fileset_id, state, user, conn): - category_text = f"Matched from {src}" - log_text = f"Matched Fileset:{fileset_id}. State {state}." - log_last = create_log( - escape_string(category_text), user, escape_string(log_text), conn - ) - update_history(fileset_last, fileset_id, conn, log_last) - - -def set_log_matched_fileset(src, fileset_last, fileset_id, state, user, conn): category_text = f"Matched from {src}" log_text = ( f"Matched Fileset:{fileset_last} with Fileset:{fileset_id}. State {state}." From e86f9822ce22c93d605d2d7fd8144b0ea660e18a Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Mon, 14 Jul 2025 18:50:33 +0530 Subject: [PATCH 17/30] INTEGRITY: Improve console logging with progress update. --- db_functions.py | 112 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 29 deletions(-) diff --git a/db_functions.py b/db_functions.py index 4c2d927..bb6a2a8 100644 --- a/db_functions.py +++ b/db_functions.py @@ -9,11 +9,13 @@ from collections import defaultdict import re import copy +import sys SPECIAL_SYMBOLS = '/":*|\\?%<>\x7f' def db_connect(): + console_log("Connecting to the Database.") base_dir = os.path.dirname(os.path.abspath(__file__)) config_path = os.path.join(base_dir, "mysql_config.json") with open(config_path) as f: @@ -28,7 +30,7 @@ def db_connect(): cursorclass=pymysql.cursors.DictCursor, autocommit=False, ) - + console_log(f"Connected to Database - {mysql_cred['dbname']}") return conn @@ -526,12 +528,17 @@ def db_insert(data_arr, username=None, skiplog=False): transaction_id = temp + 1 category_text = f"Uploaded from {src}" - log_text = f"Started loading DAT file, size {os.path.getsize(filepath)}, author {author}, version {version}. State {status}. Transaction: {transaction_id}" + log_text = f"Started loading DAT file {filepath}, size {os.path.getsize(filepath)}, author {author}, version {version}. State {status}. Transaction: {transaction_id}" user = f"cli:{getpass.getuser()}" if username is None else username create_log(escape_string(category_text), user, escape_string(log_text), conn) + console_log(log_text) + console_log_total_filesets(filepath) + + fileset_count = 1 for fileset in game_data: + console_log_detection(fileset_count) key = calc_key(fileset) megakey = calc_megakey(fileset) @@ -555,7 +562,7 @@ def db_insert(data_arr, username=None, skiplog=False): if existing_entry is not None: log_text = f"Skipping Entry as similar entry already exsits - Fileset:{existing_entry['id']}. 
Skpped entry details - engineid = {engineid}, gameid = {gameid}, platform = {platform}, language = {lang}" create_log("Warning", user, escape_string(log_text), conn) - print(log_text) + console_log(log_text) continue insert_game( @@ -594,6 +601,8 @@ def db_insert(data_arr, username=None, skiplog=False): if key not in ["name", "size", "size-r", "size-rd", "sha1", "crc"]: insert_filechecksum(file, key, file_id, conn) + fileset_count += 1 + if detection: conn.cursor().execute( "UPDATE fileset SET status = 'obsolete' WHERE `timestamp` != FROM_UNIXTIME(@fileset_time_last) AND status = 'detection'" @@ -607,6 +616,7 @@ def db_insert(data_arr, username=None, skiplog=False): fileset_insertion_count = cur.fetchone()["COUNT(fileset)"] category_text = f"Uploaded from {src}" log_text = f"Completed loading DAT file, filename {filepath}, size {os.path.getsize(filepath)}, author {author}, version {version}. State {status}. Number of filesets: {fileset_insertion_count}. Transaction: {transaction_id}" + console_log(log_text) except Exception as e: print("Inserting failed:", e) else: @@ -871,8 +881,9 @@ def match_fileset(data_arr, username=None, skiplog=False): transaction_id = transaction_id + 1 if transaction_id else 1 category_text = f"Uploaded from {src}" - log_text = f"Started loading DAT file, size {os.path.getsize(filepath)}, author {author}, version {version}. State {source_status}. Transaction: {transaction_id}" - + log_text = f"Started loading DAT file {filepath}, size {os.path.getsize(filepath)}, author {author}, version {version}. State {source_status}. Transaction: {transaction_id}" + console_log(log_text) + console_log_total_filesets(filepath) user = f"cli:{getpass.getuser()}" if username is None else username create_log(escape_string(category_text), user, escape_string(log_text), conn) @@ -941,6 +952,9 @@ def set_process( mismatch_filesets = 0 dropped_early_no_candidate = 0 dropped_early_single_candidate_multiple_sets = 0 + + fileset_count = 0 + # A mapping from set filesets to candidate filesets list set_to_candidate_dict = defaultdict(list) id_to_fileset_dict = defaultdict(dict) @@ -995,12 +1009,12 @@ def set_process( engine_name = fileset["sourcefile"].split("-")[0] if engine_name == "glk": - candidate_filesets = set_glk_filter_candidate_filesets( - fileset_id, fileset, transaction_id, engine_name, conn + (candidate_filesets, fileset_count) = set_glk_filter_candidate_filesets( + fileset_id, fileset, fileset_count, transaction_id, engine_name, conn ) else: - candidate_filesets = set_filter_candidate_filesets( - fileset_id, fileset, transaction_id, conn + (candidate_filesets, fileset_count) = set_filter_candidate_filesets( + fileset_id, fileset, fileset_count, transaction_id, conn ) # Mac files in set.dat are not represented properly and they won't find a candidate fileset for a match, so we can drop them. @@ -1016,10 +1030,18 @@ def set_process( ) dropped_early_no_candidate += 1 delete_original_fileset(fileset_id, conn) - id_to_fileset_dict[fileset_id] = fileset set_to_candidate_dict[fileset_id].extend(candidate_filesets) + console_message = "Candidate filtering finished." + console_log(console_message) + console_message = ( + f"{dropped_early_no_candidate} Filesets Dropped - No candidates found." + ) + console_log(console_message) + console_message = "Looking for duplicates..." + console_log(console_message) + # Remove all such filesets, which have many to one mapping with a single candidate, those are extra variants. 
value_to_keys = defaultdict(list) for set_fileset, candidates in set_to_candidate_dict.items(): @@ -1052,6 +1074,7 @@ def set_process( fileset["description"] if "description" in fileset else "" ) log_text = f"Drop fileset, multiple filesets mapping to single detection. Name: {fileset_name}, Description: {fileset_description}. Clashed with Fileset:{candidate} ({engine}:{gameid}-{platform}-{language})" + console_log(log_text) create_log( escape_string(category_text), user, escape_string(log_text), conn ) @@ -1062,7 +1085,9 @@ def set_process( manual_merge_map = defaultdict(list) + match_count = 1 for fileset_id, candidate_filesets in set_to_candidate_dict.items(): + console_log_matching(match_count) fileset = id_to_fileset_dict[fileset_id] # Filter by platform to reduce manual merge @@ -1092,21 +1117,15 @@ def set_process( skiplog, ) - # print(manual_merge_map) + match_count += 1 + console_log("Matching performed.") for fileset_id, candidates in manual_merge_map.items(): category_text = "Manual Merge Required" log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidates)}." manual_merged_filesets += 1 - # print(candidates) add_manual_merge( - candidates, - fileset_id, - category_text, - log_text, - log_text, - user, - conn, + candidates, fileset_id, category_text, log_text, user, conn, log_text ) # Final log @@ -1121,6 +1140,7 @@ def set_process( create_log(escape_string(category_text), user, escape_string(log_text), conn) category_text = "Upload information" log_text = f"Number of filesets: {fileset_insertion_count}. Filesets automatically merged: {auto_merged_filesets}. Filesets dropped early (no candidate) - {dropped_early_no_candidate}. Filesets dropped early (mapping to single detection) - {dropped_early_single_candidate_multiple_sets}. Filesets requiring manual merge: {manual_merged_filesets}. Partial/Full filesets already present: {fully_matched_filesets}. Partial/Full filesets with mismatch {mismatch_filesets}." + console_log(log_text) create_log(escape_string(category_text), user, escape_string(log_text), conn) @@ -1225,14 +1245,13 @@ def set_perform_match( else: category_text = "Mismatch" log_text = f"Fileset:{fileset_id} mismatched with Fileset:{matched_fileset_id} with status:{status}. Try manual merge." - print_text = f"Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}. Unmatched files: {len(unmatched_files)}." + # print_text = f"Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}. Unmatched files: {len(unmatched_files)}." mismatch_filesets += 1 add_manual_merge( [matched_fileset_id], fileset_id, category_text, log_text, - print_text, user, conn, ) @@ -1340,7 +1359,7 @@ def remove_manual_merge( def add_manual_merge( - child_filesets, parent_fileset, category_text, log_text, print_text, user, conn + child_filesets, parent_fileset, category_text, log_text, user, conn, print_text=None ): """ Adds the manual merge entries to a table called possible_merges. 
@@ -1356,7 +1375,8 @@ def add_manual_merge( cursor.execute(query, (child_fileset, parent_fileset)) create_log(escape_string(category_text), user, escape_string(log_text), conn) - print(print_text) + if print_text: + print(print_text) def is_full_checksum_match(candidate_fileset, fileset, conn): @@ -1395,14 +1415,15 @@ def is_full_checksum_match(candidate_fileset, fileset, conn): def set_glk_filter_candidate_filesets( - fileset_id, fileset, transaction_id, engine_name, conn + fileset_id, fileset, fileset_count, transaction_id, engine_name, conn ): """ Returns a list of candidate filesets for glk engines that can be merged """ with conn.cursor() as cursor: # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches - + fileset_count += 1 + console_log_candidate_filtering(fileset_count) query = """ WITH candidate_fileset AS ( SELECT fs.id AS fileset_id, f.size @@ -1469,16 +1490,19 @@ def set_glk_filter_candidate_filesets( for row in rows: candidates.append(row["fileset_id"]) - return candidates + return (candidates, fileset_count) -def set_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): +def set_filter_candidate_filesets( + fileset_id, fileset, fileset_count, transaction_id, conn +): """ Returns a list of candidate filesets that can be merged """ with conn.cursor() as cursor: # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches - + fileset_count += 1 + console_log_candidate_filtering(fileset_count) query = """ WITH candidate_fileset AS ( SELECT fs.id AS fileset_id, f.name, f.size @@ -1536,7 +1560,7 @@ def set_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): for row in rows: candidates.append(row["fileset_id"]) - return candidates + return (candidates, fileset_count) def process_fileset( @@ -2265,3 +2289,33 @@ def add_usercount(fileset, conn): cursor.execute( f"UPDATE fileset SET status = 'ReadyForReview' WHERE id = {fileset}" ) + + +def console_log(message): + sys.stdout.write(" " * 50 + "\r") + sys.stdout.flush() + print(message) + + +def console_log_candidate_filtering(fileset_count): + sys.stdout.write(f"Filtering Candidates - Fileset {fileset_count}\r") + sys.stdout.flush() + + +def console_log_matching(fileset_count): + sys.stdout.write(f"Performing Match - Fileset {fileset_count}\r") + sys.stdout.flush() + + +def console_log_detection(fileset_count): + sys.stdout.write(f"Processing - Fileset {fileset_count}\r") + sys.stdout.flush() + + +def console_log_total_filesets(file_path): + count = 0 + with open(file_path, "r") as f: + for line in f: + if line.strip().startswith("game ("): + count += 1 + print(f"Total filesets present - {count}.") From 7dcb20ba4ee3a947c27e28cf02c87d73f2db9731 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 02:45:15 +0530 Subject: [PATCH 18/30] INTEGRITY: Remove custom recursive path split function. 
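
The removed helper walked the path with os.path.split(); the replacement splits on os.sep and drops empty components, which gives the same components for the os.sep-joined relative paths the scan produces (a leading os.sep of an absolute path is dropped by the filter, which the old helper preserved). A small illustration of the replacement expression on made-up paths:

    import os

    def split_components(filepath):
        # Same expression that encode_path_components() uses after this change.
        return [part for part in filepath.split(os.sep) if part]

    samples = [
        os.path.join("scans", "disk1", "GAME.EXE"),
        "plain_file",
        os.sep.join(["dir", "", "doubled", ""]),  # empty components are dropped
    ]
    for sample in samples:
        print(sample, "->", split_components(sample))
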
--- compute_hash.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/compute_hash.py b/compute_hash.py index 626e622..d63b22f 100644 --- a/compute_hash.py +++ b/compute_hash.py @@ -132,25 +132,11 @@ def punycode_need_encode(orig): return True return False - -def split_path_recursive(path): - parts = [] - while True: - head, tail = os.path.split(path) - if tail: - parts.insert(0, tail) - path = head - else: - if head: - parts.insert(0, head) - break - return parts - def encode_path_components(filepath): """ Puny encodes all separate components of filepath """ - parts = split_path_recursive(filepath) + parts = [i for i in filepath.split(os.sep) if i ] encoded_parts = [encode_punycode(p) if punycode_need_encode(p) else p for p in parts] return os.path.join(*encoded_parts) @@ -733,7 +719,7 @@ def create_dat_file(hash_of_dirs, path, checksum_size=0): file.write("game (\n") for filename, (hashes, size, size_r, size_rd, timestamp) in hash_of_dir.items(): filename = encode_path_components(filename) - data = f"name \"{filename}\" size {size} size-r {size_r} size-rd {size_rd} timestamp {timestamp}" + data = f"name '{filename}' size {size} size-r {size_r} size-rd {size_rd} modification-time {timestamp}" for key, value in hashes: data += f" {key} {value}" From 96d9cf473ca63bb83de3b4086efd105112ea46b3 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 02:49:18 +0530 Subject: [PATCH 19/30] INTEGRITY: Use INFORMATION_SCHEMA.COLUMNS instead of relying on error handling for column migration. --- schema.py | 132 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 53 deletions(-) diff --git a/schema.py b/schema.py index 4b9ba42..9bf42ee 100644 --- a/schema.py +++ b/schema.py @@ -164,59 +164,85 @@ def init_database(): "file_fileset_detection": "CREATE INDEX file_fileset_detection ON file (fileset, detection)", } - try: - cursor.execute("ALTER TABLE file ADD COLUMN detection_type VARCHAR(20);") - except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE file MODIFY COLUMN detection_type VARCHAR(20);") - - try: - cursor.execute("ALTER TABLE file ADD COLUMN `timestamp` TIMESTAMP NOT NULL;") - except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE file MODIFY COLUMN `timestamp` TIMESTAMP NOT NULL;") - - try: - cursor.execute("ALTER TABLE fileset ADD COLUMN `user_count` INT;") - except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE fileset MODIFY COLUMN `user_count` INT;") - - try: - cursor.execute("ALTER TABLE file ADD COLUMN punycode_name VARCHAR(200);") - except Exception: - cursor.execute("ALTER TABLE file MODIFY COLUMN punycode_name VARCHAR(200);") - - try: - cursor.execute( - "ALTER TABLE file ADD COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" - ) - except Exception: - cursor.execute( - "ALTER TABLE file MODIFY COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" - ) - - try: - cursor.execute( - "ALTER TABLE file ADD COLUMN `size-r` BIGINT DEFAULT 0, ADD COLUMN `size-rd` BIGINT DEFAULT 0;" - ) - except Exception: - cursor.execute( - "ALTER TABLE file MODIFY COLUMN `size-r` BIGINT DEFAULT 0, MODIFY COLUMN `size-rd` BIGINT DEFAULT 0;" - ) - try: - cursor.execute("ALTER TABLE log ADD COLUMN `text` varchar(5000);") - except Exception: - cursor.execute("ALTER TABLE log MODIFY COLUMN `text` varchar(5000);") - - try: - cursor.execute( - "ALTER TABLE fileset ADD COLUMN set_dat_metadata 
varchar(5000) DEFAULT 'UTF-8';" - ) - except Exception: - cursor.execute( - "ALTER TABLE fileset MODIFY COLUMN set_dat_metadata varchar(5000) DEFAULT 'UTF-8';" - ) + def migrate_column(cursor, table_name, column_name, add_sql, modify_sql): + query = """ + SELECT COUNT(*) AS count + FROM INFORMATION_SCHEMA.COLUMNS + WHERE table_name = %s AND column_name = %s + """ + cursor.execute(query, (table_name, column_name)) + exists = cursor.fetchone()["count"] > 0 + + if exists: + print(f"Modifying column '{column_name}' in table '{table_name}'") + cursor.execute(modify_sql) + else: + print(f"Adding column '{column_name}' to table '{table_name}'") + cursor.execute(add_sql) + + migrate_column( + cursor, + "file", + "detection_type", + "ALTER TABLE file ADD COLUMN detection_type VARCHAR(20);", + "ALTER TABLE file MODIFY COLUMN detection_type VARCHAR(20);", + ) + + migrate_column( + cursor, + "file", + "timestamp", + "ALTER TABLE file ADD COLUMN `timestamp` TIMESTAMP NOT NULL;", + "ALTER TABLE file MODIFY COLUMN `timestamp` TIMESTAMP NOT NULL;", + ) + + migrate_column( + cursor, + "fileset", + "user_count", + "ALTER TABLE fileset ADD COLUMN `user_count` INT;", + "ALTER TABLE fileset MODIFY COLUMN `user_count` INT;", + ) + + migrate_column( + cursor, + "file", + "punycode_name", + "ALTER TABLE file ADD COLUMN punycode_name VARCHAR(200);", + "ALTER TABLE file MODIFY COLUMN punycode_name VARCHAR(200);", + ) + + migrate_column( + cursor, + "file", + "encoding_type", + "ALTER TABLE file ADD COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';", + "ALTER TABLE file MODIFY COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';", + ) + + migrate_column( + cursor, + "file", + "size-r", + "ALTER TABLE file ADD COLUMN `size-r` BIGINT DEFAULT 0;", + "ALTER TABLE file MODIFY COLUMN `size-r` BIGINT DEFAULT 0;", + ) + + migrate_column( + cursor, + "file", + "size-rd", + "ALTER TABLE file ADD COLUMN `size-rd` BIGINT DEFAULT 0;", + "ALTER TABLE file MODIFY COLUMN `size-rd` BIGINT DEFAULT 0;", + ) + + migrate_column( + cursor, + "log", + "text", + "ALTER TABLE log ADD COLUMN `text` VARCHAR(5000);", + "ALTER TABLE log MODIFY COLUMN `text` VARCHAR(5000);", + ) for index, definition in indices.items(): try: From 4c9a5e7d992a254df2e3bb8999e1df6abb640e70 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 02:57:57 +0530 Subject: [PATCH 20/30] INTEGRITY: Add scan processing logic. --- db_functions.py | 658 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 658 insertions(+) diff --git a/db_functions.py b/db_functions.py index bb6a2a8..683fb63 100644 --- a/db_functions.py +++ b/db_functions.py @@ -902,6 +902,21 @@ def match_fileset(data_arr, username=None, skiplog=False): user, skiplog, ) + elif src == "scan": + scan_process( + game_data, + resources, + detection, + src, + conn, + transaction_id, + filepath, + author, + version, + source_status, + user, + skiplog, + ) else: game_data_lookup = {fs["name"]: fs for fs in game_data} for fileset in game_data: @@ -924,6 +939,628 @@ def match_fileset(data_arr, username=None, skiplog=False): ) +def scan_process( + game_data, + resources, + detection, + src, + conn, + transaction_id, + filepath, + author, + version, + source_status, + user, + skiplog, +): + """ + Entry point for processing logic for scan.dat. + First Pass - Update all files with matching checksum and file size. + Second Pass - Filter candidate with matching with filename, filesize and filechecksum + - Perform matching. 
+ """ + + manual_merged_filesets = 0 + automatic_merged_filesets = 0 + match_with_full_fileset = 0 + mismatch_with_full_fileset = 0 + dropped_early_no_candidate = 0 + manual_merged_with_detection = 0 + filesets_with_missing_files = 0 + + id_to_fileset_mapping = defaultdict(dict) + + for fileset in game_data: + key = calc_key(fileset) + megakey = "" + log_text = f"State {source_status}." + + (fileset_id, existing) = insert_new_fileset( + fileset, + conn, + detection, + src, + key, + megakey, + transaction_id, + log_text, + user, + skiplog=skiplog, + ) + if existing: + continue + + id_to_fileset_mapping[fileset_id] = fileset + + # set of filesets whose files got updated + filesets_check_for_full = set() + + for rom in fileset["rom"]: + scan_update_files(rom, filesets_check_for_full, transaction_id, conn) + + for fileset_id, fileset in id_to_fileset_mapping.items(): + candidate_filesets = scan_filter_candidate_filesets( + fileset_id, fileset, transaction_id, conn + ) + + if len(candidate_filesets) == 0: + category_text = "Drop fileset - No Candidates" + fileset_name = fileset["name"] if "name" in fileset else "" + fileset_description = ( + fileset["description"] if "description" in fileset else "" + ) + log_text = f"Drop fileset as no matching candidates. Name: {fileset_name}, Description: {fileset_description}." + create_log( + escape_string(category_text), user, escape_string(log_text), conn + ) + dropped_early_no_candidate += 1 + delete_original_fileset(fileset_id, conn) + continue + + ( + automatic_merged_filesets, + manual_merged_filesets, + match_with_full_fileset, + mismatch_with_full_fileset, + manual_merged_with_detection, + filesets_with_missing_files, + ) = scan_perform_match( + fileset, + src, + user, + fileset_id, + detection, + candidate_filesets, + automatic_merged_filesets, + manual_merged_filesets, + match_with_full_fileset, + mismatch_with_full_fileset, + manual_merged_with_detection, + filesets_with_missing_files, + conn, + skiplog, + ) + + # Final log + with conn.cursor() as cursor: + cursor.execute( + "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", + (transaction_id,), + ) + fileset_insertion_count = cursor.fetchone()["COUNT(fileset)"] + category_text = f"Uploaded from {src}" + log_text = f"Completed loading DAT file, filename {filepath}, size {os.path.getsize(filepath)}. State {source_status}. Number of filesets: {fileset_insertion_count}. Transaction: {transaction_id}" + create_log(escape_string(category_text), user, escape_string(log_text), conn) + category_text = "Upload information" + log_text = f"Number of filesets: {fileset_insertion_count}. Filesets automatically merged: {automatic_merged_filesets}. Filesets requiring manual merge (multiple candidates): {manual_merged_filesets}. Filesets requiring manual merge (matched with detection): {manual_merged_with_detection}. Filesets dropped, no candidate: {dropped_early_no_candidate}. Filesets matched with existing Full fileset: {match_with_full_fileset}. Filesets with mismatched files with Full fileset: {mismatch_with_full_fileset}. Filesets missing files compared to partial fileset candidate: {filesets_with_missing_files}." + create_log(escape_string(category_text), user, escape_string(log_text), conn) + + +def scan_update_files(rom, filesets_check_for_full, transaction_id, conn): + """ + Updates all the checksums for the files matching by a checksum and size. 
+ """ + with conn.cursor() as cursor: + checksums = defaultdict(str) + for key in rom: + if key not in ["name", "size", "size-r", "size-rd", "modification-time"]: + checksums[key] = rom[key] + + files_to_update = set() + + for _, checksum in checksums.items(): + query = """ + SELECT f.id as file_id, fs.id as fileset_id + FROM file f + JOIN filechecksum fc ON fc.file = f.id + JOIN fileset fs ON fs.id = f.fileset + JOIN transactions t ON t.fileset = fs.id + WHERE fc.checksum = %s + AND f.size = %s + AND f.`size-r` = %s + AND f.`size-rd` = %s + AND t.transaction != %s + """ + size = rom["size"] if "size" in rom else 0 + size_r = rom["size-r"] if "size-r" in rom else 0 + size_rd = rom["size-rd"] if "size-rd" in rom else 0 + cursor.execute(query, (checksum, size, size_r, size_rd, transaction_id)) + result = cursor.fetchall() + if result: + for file in result: + filesets_check_for_full.add(file["fileset_id"]) + files_to_update.add(file["file_id"]) + + for file_id in files_to_update: + query = """ + DELETE FROM filechecksum + WHERE file = %s + """ + cursor.execute(query, (file_id,)) + for check, checksum in checksums.items(): + checksize, checktype, checksum = get_checksum_props(check, checksum) + query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)" + cursor.execute(query, (file_id, checksize, checktype, checksum)) + + conn.commit() + + +def scan_perform_match( + fileset, + src, + user, + fileset_id, + detection, + candidate_filesets, + automatic_merged_filesets, + manual_merged_filesets, + match_with_full_fileset, + mismatch_with_full_fileset, + manual_merged_with_detection, + filesets_with_missing_files, + conn, + skiplog, +): + """ + Performs matching for scan.dat. + If single candidate for match: + detection -> Copy all the files and checksums from scan. + partial -> Copy all the files and checksums from scan. + full -> Drop the scan fileset. But show the differences in file if any. + If more than one candidate for match: + Put them for manual merge. + """ + with conn.cursor() as cursor: + if len(candidate_filesets) == 1: + matched_fileset_id = candidate_filesets[0] + cursor.execute( + "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,) + ) + status = cursor.fetchone()["status"] + # Partial filesets can be turned full directly, as the files have already been updated. + # But the files that had missing size were not updated, so we need to check. + if status == "partial": + # Partial filesets contain all the files, so does the scanned filesets, so this case should not ideally happen. + if total_files(matched_fileset_id, conn) > total_fileset_files(fileset): + category_text = "Missing files" + log_text = f"Missing files in Fileset:{fileset_id}. Try manual merge with Fileset:{matched_fileset_id}." + add_manual_merge( + candidate_filesets, + fileset_id, + category_text, + log_text, + user, + conn, + log_text, + ) + filesets_with_missing_files += 1 + + else: + update_all_files(fileset, matched_fileset_id, False, conn) + update_fileset_status(cursor, matched_fileset_id, "full") + if not skiplog: + log_matched_fileset( + src, + fileset_id, + matched_fileset_id, + "full", + user, + conn, + ) + delete_original_fileset(fileset_id, conn) + automatic_merged_filesets += 1 + + # Detection filests can be turned full if the number of files are equal, + # otherwise we do manual merge to remove extra files. 
+ elif status == "detection": + if total_fileset_files(fileset) == total_files( + matched_fileset_id, conn, detection_only=True + ): + update_all_files(fileset, matched_fileset_id, True, conn) + update_fileset_status(cursor, matched_fileset_id, "full") + if not skiplog: + log_matched_fileset( + src, + fileset_id, + matched_fileset_id, + "full", + user, + conn, + ) + delete_original_fileset(fileset_id, conn) + automatic_merged_filesets += 1 + + else: + category_text = "Manual Merge - Detection found" + log_text = f"Matched with detection. Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}." + add_manual_merge( + candidate_filesets, + fileset_id, + category_text, + log_text, + user, + conn, + log_text, + ) + manual_merged_with_detection += 1 + + # Drop the fileset, note down the file differences + elif status == "full": + (unmatched_candidate_files, unmatched_scan_files) = get_unmatched_files( + matched_fileset_id, fileset, conn + ) + fully_matched = ( + True + if len(unmatched_candidate_files) == 0 + and len(unmatched_scan_files) == 0 + else False + ) + if fully_matched: + match_with_full_fileset += 1 + else: + mismatch_with_full_fileset += 1 + log_scan_match_with_full( + fileset_id, + matched_fileset_id, + unmatched_candidate_files, + unmatched_scan_files, + fully_matched, + user, + conn, + ) + delete_original_fileset(fileset_id, conn) + + elif len(candidate_filesets) > 1: + category_text = "Manual Merge - Multiple Candidates" + log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidate_filesets)}." + manual_merged_filesets += 1 + add_manual_merge( + candidate_filesets, + fileset_id, + category_text, + log_text, + user, + conn, + log_text, + ) + + return ( + automatic_merged_filesets, + manual_merged_filesets, + match_with_full_fileset, + mismatch_with_full_fileset, + manual_merged_with_detection, + filesets_with_missing_files, + ) + + +def update_all_files(fileset, candidate_fileset_id, is_candidate_detection, conn): + """ + Updates all the files, if they were missed out earlier due to missing size. + """ + with conn.cursor() as cursor: + # Extracting the filename from the filepath. 
+ cursor.execute( + f"SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = {candidate_fileset_id}" + ) + target_files = cursor.fetchall() + candidate_files = { + target_file["id"]: target_file["name"].lower() + for target_file in target_files + } + + scan_checksums = set() + scan_names_by_checksum = defaultdict(str) + same_filename_count = defaultdict(int) + + filename_to_filepath_map = defaultdict(str) + filepath_to_checksum_map = defaultdict(dict) + filepath_to_sizes_map = defaultdict(dict) + + for file in fileset["rom"]: + base_name = os.path.basename(normalised_path(file["name"])).lower() + checksums = defaultdict(str) + sizes = defaultdict(int) + for key in file: + if key.startswith("md5"): + scan_checksums.add((file[key], base_name)) + scan_names_by_checksum[(file[key], base_name)] = file["name"] + checksums[key] = file[key] + if key.startswith("size"): + sizes[key] = file[key] + + filepath_to_sizes_map[file["name"]] = sizes + filepath_to_checksum_map[file["name"]] = checksums + same_filename_count[base_name] += 1 + filename_to_filepath_map[base_name] = file["name"] + + checksums = defaultdict(dict) + filepath = "" + + for file_id, file_name in candidate_files.items(): + file_name = file_name.lower() + # Match by filename + if same_filename_count[file_name] == 1: + filepath = filename_to_filepath_map[file_name] + checksums = filepath_to_checksum_map[filepath] + + # If same filename occurs multiple times, fallback to checksum based match + else: + cursor.execute( + "SELECT checksum FROM filechecksum WHERE file = %s", (file_id,) + ) + checksum_rows = cursor.fetchall() + for row in checksum_rows: + checksum = row["checksum"] + if (checksum, file_name) in scan_checksums: + filepath = scan_names_by_checksum[(checksum, file_name)] + checksums = filepath_to_checksum_map[filepath] + + # Delete older checksums + query = """ + DELETE FROM filechecksum + WHERE file = %s + """ + cursor.execute(query, (file_id,)) + # Update the checksums + for key, checksum in checksums.items(): + checksize, checktype, checksum = get_checksum_props(key, checksum) + query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)" + cursor.execute(query, (file_id, checksize, checktype, checksum)) + + # Also updates the sizes, do not update the name if fileset not in detection state + query = """ + UPDATE file + SET size = %s, + `size-r` = %s, + `size-rd` = %s + """ + sizes = filepath_to_sizes_map[filepath] + print(sizes) + if is_candidate_detection: + query += ",name = %s WHERE id = %s" + params = ( + sizes["size"], + sizes["size-r"], + sizes["size-rd"], + normalised_path(filepath), + file_id, + ) + else: + query += "WHERE id = %s" + params = (sizes["size"], sizes["size-r"], sizes["size-rd"], file_id) + cursor.execute(query, params) + + +def total_files(fileset_id, conn, detection_only=False): + """ + Returns the total number of files (only detection files if detection_only set to true) present in the given fileset from the database. 
+ """ + with conn.cursor() as cursor: + query = """ + SELECT COUNT(*) AS count + FROM file f + JOIN fileset fs ON fs.id = f.fileset + """ + if detection_only: + query += """ + WHERE f.detection = 1 + AND fs.id = %s + """ + else: + query += "WHERE fs.id = %s" + cursor.execute(query, (fileset_id,)) + return cursor.fetchone()["count"] + + +def total_fileset_files(fileset): + """ + Returns the number of files present in the fileset + """ + return len(fileset["rom"]) + + +def scan_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): + """ + Returns a list of candidate filesets that can be merged + """ + with conn.cursor() as cursor: + # Returns those filesets which have all detection files matching in the scan fileset filtered by file name and file size(if not -1). + + query = """ + WITH candidate_fileset AS ( + SELECT fs.id AS fileset_id, f.name, f.size, + f.`size-r` AS size_r, f.`size-rd` AS size_rd + FROM file f + JOIN fileset fs ON f.fileset = fs.id + JOIN game g ON g.id = fs.game + JOIN transactions t ON t.fileset = fs.id + WHERE f.detection = 1 + AND t.transaction != %s + ), + total_detection_files AS ( + SELECT cf.fileset_id, COUNT(*) AS detection_files_found + FROM candidate_fileset cf + GROUP BY fileset_id + ), + set_fileset AS ( + SELECT name, size, + `size-r` AS size_r, `size-rd` AS size_rd + FROM file + WHERE fileset = %s + ), + matched_detection_files AS ( + SELECT cf.fileset_id, COUNT(*) AS match_files_count + FROM candidate_fileset cf + JOIN set_fileset sf ON ( ( + cf.name = sf.name + OR + REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '') + ) AND (cf.size = sf.size OR cf.size = -1) + AND (cf.size_r = sf.size_r) + AND (cf.size_rd = sf.size_rd)) + GROUP BY cf.fileset_id + ), + valid_matched_detection_files AS ( + SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count + FROM matched_detection_files mdf + JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id + WHERE tdf.detection_files_found <= mdf.match_files_count + ), + max_match_count AS ( + SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files + ) + SELECT vmdf.fileset_id + FROM valid_matched_detection_files vmdf + JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id + JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count + """ + + cursor.execute(query, (transaction_id, fileset_id)) + rows = cursor.fetchall() + + candidates = [] + if rows: + for row in rows: + candidates.append(row["fileset_id"]) + + for candidate in candidates: + if not is_full_detection_checksum_match(candidate, fileset, conn): + candidates.remove(candidate) + + return candidates + + +def get_unmatched_files(candidate_fileset, fileset, conn): + """ + Checks if all checksums from candidate_fileset match scan file checksums. 
+ Returns: + unmatched_candidate_files: candidate files whose checksums weren't found in scan + unmatched_scan_files: scan files whose checksums weren't matched by candidate + """ + with conn.cursor() as cursor: + cursor.execute( + "SELECT id, name FROM file WHERE fileset = %s", (candidate_fileset,) + ) + candidate_file_rows = cursor.fetchall() + candidate_files = {row["id"]: row["name"] for row in candidate_file_rows} + + scan_checksums = set() + scan_names_by_checksum = {} + + for file in fileset["rom"]: + base_name = os.path.basename(normalised_path(file["name"])).lower() + for key in file: + if key.startswith("md5"): + scan_checksums.add((file[key], base_name)) + scan_names_by_checksum[(file[key], base_name)] = file["name"] + + unmatched_candidate_files = [] + matched_scan_pairs = set() + + for file_id, file_name in candidate_files.items(): + cursor.execute( + "SELECT checksum FROM filechecksum WHERE file = %s", (file_id,) + ) + checksum_rows = cursor.fetchall() + + base_name = os.path.basename(file_name).lower() + match_found = False + + for row in checksum_rows: + checksum = row["checksum"] + if (checksum, base_name) in scan_checksums: + matched_scan_pairs.add((checksum, base_name)) + match_found = True + + if not match_found: + unmatched_candidate_files.append(file_name) + + unmatched_scan_files = { + scan_names_by_checksum[key] + for key in scan_checksums + if key not in matched_scan_pairs + } + unmatched_scan_files = list(unmatched_scan_files) + + return (unmatched_candidate_files, unmatched_scan_files) + + +def is_full_detection_checksum_match(candidate_fileset, fileset, conn): + """ + Return type - Boolean + Checks if all the detection files in the candidate fileset have corresponding checksums matching with scan. + + scan - rom ( name "AFM Read Me!_2" size 8576 size-r 1 size-rd 0 modification-time 1993-05-12 md5 dsd16ccea050db521a678a1cdc33794c md5-5000 008e76ec3ae58d0add637ea7aa299a2a md5-t-5000 118e76ec3ae58d0add637ea7aa299a2c md5-1048576 37d16ccea050db521a678a1cdc33794c) + """ + with conn.cursor() as cursor: + cursor.execute( + "SELECT id, name FROM file WHERE detection=1 AND fileset = %s", + (candidate_fileset,), + ) + target_files = cursor.fetchall() + candidate_files = { + target_file["id"]: target_file["name"] for target_file in target_files + } + + # set of (checksum, filename) + scan_checksums = set() + for file in fileset["rom"]: + for key in file: + if key.startswith("md5"): + name = os.path.basename(normalised_path(file["name"])) + scan_checksums.add((file[key], name.lower())) + + for detection_file_id, detection_file_name in candidate_files.items(): + query = """ + SELECT fc.checksum, fc.checksize, fc.checktype + FROM filechecksum fc + WHERE fc.file = %s + """ + cursor.execute(query, (detection_file_id,)) + checksums_info = cursor.fetchall() + match_found = False + if checksums_info: + for checksum_info in checksums_info: + checksum = checksum_info["checksum"] + if ( + checksum, + os.path.basename(detection_file_name.lower()), + ) not in scan_checksums: + match_found = True + break + + if match_found: + return False + + return True + + +# ------------------------------------------------------------------------------------------------------- +# Set.dat processing below +# ------------------------------------------------------------------------------------------------------- + + def set_process( game_data, resources, @@ -2085,6 +2722,27 @@ def log_matched_fileset(src, fileset_last, fileset_id, state, user, conn): update_history(fileset_last, fileset_id, conn, 
log_last) +def log_scan_match_with_full( + fileset_last, + candidate_id, + unmatched_candidate_files, + unmatched_scan_files, + fully_matched, + user, + conn, +): + category_text = "Mismatch with Full set" + if fully_matched: + category_text = "Existing as Full set." + log_text = f"""Files mismatched with Full Fileset:{candidate_id}. Unmatched Files in scan fileset = {len(unmatched_scan_files)}. Unmatched Files in full fileset = {len(unmatched_candidate_files)}. List of unmatched files scan.dat : {", ".join(scan_file for scan_file in unmatched_scan_files)}, List of unmatched files full fileset : {", ".join(scan_file for scan_file in unmatched_candidate_files)}""" + if fully_matched: + log_text = ( + f"Fileset matched completely with Full Fileset:{candidate_id}. Dropping." + ) + print(log_text) + create_log(escape_string(category_text), user, escape_string(log_text), conn) + + def finalize_fileset_insertion( conn, transaction_id, src, filepath, author, version, source_status, user ): From 33cac5a068e6c0360c37c34a46afaa7ac2da9447 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 03:02:14 +0530 Subject: [PATCH 21/30] INTEGRITY: Add additional modification-time column in file table. --- dat_parser.py | 3 +++ db_functions.py | 7 +++++-- fileset.py | 9 +++------ schema.py | 16 ++++++++++++++++ 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/dat_parser.py b/dat_parser.py index b3ce12e..a76480b 100644 --- a/dat_parser.py +++ b/dat_parser.py @@ -33,6 +33,9 @@ def map_checksum_data(content_string): elif tokens[i] == "size-rd": current_rom["size-rd"] = int(tokens[i + 1]) i += 2 + elif tokens[i] == "modification-time": + current_rom["modification-time"] = tokens[i + 1] + i += 2 else: checksum_key = tokens[i] checksum_value = tokens[i + 1] if len(tokens) >= 6 else "0" diff --git a/db_functions.py b/db_functions.py index 683fb63..408ea29 100644 --- a/db_functions.py +++ b/db_functions.py @@ -234,13 +234,16 @@ def insert_file(file, detection, src, conn): values.append(file["size-r"] if "size-r" in file else "0") values.append(file["size-rd"] if "size-rd" in file else "0") + modification_time = file["modification-time"] if "modification-time" in file else "" + values.append(modification_time) + values.extend([checksum, detection, detection_type]) # Parameterised Query placeholders = ( - ["%s"] * (len(values[:5])) + ["@fileset_last"] + ["%s"] * 2 + ["NOW()"] + ["%s"] * (len(values[:6])) + ["@fileset_last"] + ["%s"] * 2 + ["NOW()"] ) - query = f"INSERT INTO file ( name, size, `size-r`, `size-rd`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ({', '.join(placeholders)})" + query = f"INSERT INTO file ( name, size, `size-r`, `size-rd`, `modification-time`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ({', '.join(placeholders)})" with conn.cursor() as cursor: cursor.execute(query, values) diff --git a/fileset.py b/fileset.py index c8e9cf7..a930f54 100644 --- a/fileset.py +++ b/fileset.py @@ -189,7 +189,6 @@ def fileset(): query = """SELECT game.name as 'game name', engineid, gameid, extra, platform, language, fileset.set_dat_metadata FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = %s""" else: query = """SELECT game.name as 'game name', engineid, gameid, extra, platform, language FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = %s""" - print(query) cursor.execute(query, (id,)) result = {**result, **cursor.fetchone()} else: 
@@ -240,6 +239,7 @@ def fileset(): "detection", "detection_type", "timestamp", + "modification-time", ] if sort: @@ -250,13 +250,10 @@ if "desc" in sort: order += " DESC" - columns_to_select = "file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp`" + columns_to_select = "file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp`, `modification-time`" columns_to_select += ", ".join(md5_columns) - print( - f"SELECT file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp` FROM file WHERE fileset = {id} {order}" - ) cursor.execute( - f"SELECT file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp` FROM file WHERE fileset = {id} {order}" + f"SELECT file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp`, `modification-time` FROM file WHERE fileset = {id} {order}" ) result = cursor.fetchall() diff --git a/schema.py b/schema.py index 9bf42ee..776eada 100644 --- a/schema.py +++ b/schema.py @@ -244,6 +244,22 @@ def migrate_column(cursor, table_name, column_name, add_sql, modify_sql): "ALTER TABLE log MODIFY COLUMN `text` VARCHAR(5000);", ) + migrate_column( + cursor, + "fileset", + "set_dat_metadata", + "ALTER TABLE fileset ADD COLUMN set_dat_metadata VARCHAR(5000) DEFAULT '';", + "ALTER TABLE fileset MODIFY COLUMN set_dat_metadata VARCHAR(5000) DEFAULT '';", + ) + + migrate_column( + cursor, + "file", + "modification-time", + "ALTER TABLE file ADD COLUMN `modification-time` VARCHAR(100) DEFAULT '';", + "ALTER TABLE file MODIFY COLUMN `modification-time` VARCHAR(100) DEFAULT '';", + ) + for index, definition in indices.items(): try: cursor.execute(definition) From 90ffe1a37ec8d1de4311c4cefd6cb248a4e78ac6 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 03:04:24 +0530 Subject: [PATCH 22/30] INTEGRITY: Additional error handling while extracting keys from scummvm.dat. --- db_functions.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/db_functions.py b/db_functions.py index 408ea29..f9db3bc 100644 --- a/db_functions.py +++ b/db_functions.py @@ -546,13 +546,19 @@ def db_insert(data_arr, username=None, skiplog=False): megakey = calc_megakey(fileset) if detection: - engine_name = fileset["engine"] - engineid = fileset["sourcefile"] - gameid = fileset["name"] - title = fileset["title"] - extra = fileset["extra"] - platform = fileset["platform"] - lang = fileset["language"] + try: + engine_name = fileset.get("engine", "") + engineid = fileset["sourcefile"] + gameid = fileset["name"] + title = fileset.get("title", "") + extra = fileset.get("extra", "") + platform = fileset.get("platform", "") + lang = fileset.get("language", "") + except KeyError as e: + print( + f"Missing key in header: {e} for {fileset.get('name', '')}-{fileset.get('language', '')}-{fileset.get('platform', '')}" + ) + return with conn.cursor() as cursor: query = """ From 493acb524bc3fcff22c05f6985796d8ff37757ac Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 04:29:37 +0530 Subject: [PATCH 23/30] INTEGRITY: Traverse set.dat instead of candidate fileset while searching mismatched files.
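The idea, roughly: build the checksum lookup from the set.dat side and walk the candidate's files against it, so the unmatched lists for both sides fall out of a single pass. A simplified, self-contained sketch of that approach (plain dicts and lists with illustrative names, not the actual database rows the diff below works with):

    import os

    def unmatched_on_both_sides(dat_files, candidate_files):
        # dat_files: [{"name": "DATA/GAME.EXE", "md5": "..."}, ...]
        # candidate_files: [{"name": "game.exe", "checksums": ["...", ...]}, ...]
        dat_pairs = {}
        for f in dat_files:
            base = os.path.basename(f["name"].replace("\\", "/")).lower()
            for key, value in f.items():
                if key.startswith("md5"):
                    dat_pairs[(value, base)] = f["name"]

        matched = set()
        unmatched_candidate = []
        for cf in candidate_files:
            base = os.path.basename(cf["name"].replace("\\", "/")).lower()
            hits = [(c, base) for c in cf["checksums"] if (c, base) in dat_pairs]
            if hits:
                matched.update(hits)
            else:
                unmatched_candidate.append(cf["name"])

        unmatched_dat = [name for pair, name in dat_pairs.items() if pair not in matched]
        return unmatched_candidate, unmatched_dat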
--- db_functions.py | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/db_functions.py b/db_functions.py index f9db3bc..6906a64 100644 --- a/db_functions.py +++ b/db_functions.py @@ -281,8 +281,6 @@ def add_all_equal_checksums(checksize, checktype, checksum, file_id, conn): size_name = "size" if checktype[-1] == "r": size_name += "-rd" - if checktype[-1] == "s": - size_name += "-d" cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = {file_id}") result = cursor.fetchone() @@ -1345,7 +1343,6 @@ def update_all_files(fileset, candidate_fileset_id, is_candidate_detection, conn `size-rd` = %s """ sizes = filepath_to_sizes_map[filepath] - print(sizes) if is_candidate_detection: query += ",name = %s WHERE id = %s" params = ( @@ -1462,10 +1459,10 @@ def scan_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): def get_unmatched_files(candidate_fileset, fileset, conn): """ - Checks if all checksums from candidate_fileset match scan file checksums. + Checks if all checksums from candidate_fileset match dat file checksums. Returns: unmatched_candidate_files: candidate files whose checksums weren't found in scan - unmatched_scan_files: scan files whose checksums weren't matched by candidate + unmatched_dat_files: dat files whose checksums weren't matched by candidate """ with conn.cursor() as cursor: cursor.execute( @@ -1474,18 +1471,18 @@ def get_unmatched_files(candidate_fileset, fileset, conn): candidate_file_rows = cursor.fetchall() candidate_files = {row["id"]: row["name"] for row in candidate_file_rows} - scan_checksums = set() - scan_names_by_checksum = {} + dat_checksums = set() + dat_names_by_checksum = {} for file in fileset["rom"]: base_name = os.path.basename(normalised_path(file["name"])).lower() for key in file: if key.startswith("md5"): - scan_checksums.add((file[key], base_name)) - scan_names_by_checksum[(file[key], base_name)] = file["name"] + dat_checksums.add((file[key], base_name)) + dat_names_by_checksum[(file[key], base_name)] = file["name"] unmatched_candidate_files = [] - matched_scan_pairs = set() + matched_dat_pairs = set() for file_id, file_name in candidate_files.items(): cursor.execute( @@ -1498,21 +1495,21 @@ def get_unmatched_files(candidate_fileset, fileset, conn): for row in checksum_rows: checksum = row["checksum"] - if (checksum, base_name) in scan_checksums: - matched_scan_pairs.add((checksum, base_name)) + if (checksum, base_name) in dat_checksums: + matched_dat_pairs.add((checksum, base_name)) match_found = True if not match_found: unmatched_candidate_files.append(file_name) - unmatched_scan_files = { - scan_names_by_checksum[key] - for key in scan_checksums - if key not in matched_scan_pairs + unmatched_dat_files = { + dat_names_by_checksum[key] + for key in dat_checksums + if key not in matched_dat_pairs } - unmatched_scan_files = list(unmatched_scan_files) + unmatched_dat_files = list(unmatched_dat_files) - return (unmatched_candidate_files, unmatched_scan_files) + return (unmatched_candidate_files, unmatched_dat_files) def is_full_detection_checksum_match(candidate_fileset, fileset, conn): @@ -1524,7 +1521,7 @@ def is_full_detection_checksum_match(candidate_fileset, fileset, conn): """ with conn.cursor() as cursor: cursor.execute( - "SELECT id, name FROM file WHERE detection=1 AND fileset = %s", + "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name FROM file WHERE detection=1 AND fileset = %s", (candidate_fileset,), ) target_files = cursor.fetchall() @@ -1682,7 
+1679,7 @@ def set_process( console_message = "Candidate filtering finished." console_log(console_message) console_message = ( - f"{dropped_early_no_candidate} Filesets Dropped - No candidates found." + f"{dropped_early_no_candidate} Filesets Dropped for No candidates." ) console_log(console_message) console_message = "Looking for duplicates..." @@ -1872,9 +1869,15 @@ def set_perform_match( matched_fileset_id, manual_merge_map, set_to_candidate_dict, conn ) elif status == "partial" or status == "full": - (is_match, unmatched_files) = is_full_checksum_match( + (unmatched_candidate_files, unmatched_dat_files) = get_unmatched_files( matched_fileset_id, fileset, conn ) + is_match = ( + True + if len(unmatched_candidate_files) == 0 + and len(unmatched_dat_files) == 0 + else False + ) if is_match: category_text = "Already present" log_text = f"Already present as - Fileset:{matched_fileset_id}. Deleting Fileset:{fileset_id}" @@ -1890,7 +1893,8 @@ def set_perform_match( else: category_text = "Mismatch" - log_text = f"Fileset:{fileset_id} mismatched with Fileset:{matched_fileset_id} with status:{status}. Try manual merge." + log_text = f"Fileset:{fileset_id} mismatched with Fileset:{matched_fileset_id} with status:{status}. Try manual merge. Unmatched Files in set.dat fileset = {len(unmatched_dat_files)} Unmatched Files in candidate fileset = {len(unmatched_candidate_files)}. List of unmatched files scan.dat : {', '.join(scan_file for scan_file in unmatched_dat_files)}, List of unmatched files full fileset : {', '.join(scan_file for scan_file in unmatched_candidate_files)}" + console_log(log_text) # print_text = f"Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}. Unmatched files: {len(unmatched_files)}." mismatch_filesets += 1 add_manual_merge( @@ -1904,7 +1908,6 @@ def set_perform_match( elif len(candidate_filesets) > 1: found_match = False - for candidate_fileset in candidate_filesets: (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn) if is_match: From ff9934f5cfb55ce58125c4d332e185e00183b2b1 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Wed, 16 Jul 2025 18:46:46 +0530 Subject: [PATCH 24/30] INTEGRITY: Add checksum based filtering in set.dat, when possible. --- db_functions.py | 134 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 97 insertions(+), 37 deletions(-) diff --git a/db_functions.py b/db_functions.py index 6906a64..3a4fc12 100644 --- a/db_functions.py +++ b/db_functions.py @@ -977,7 +977,9 @@ def scan_process( id_to_fileset_mapping = defaultdict(dict) + fileset_count = 0 for fileset in game_data: + console_log_file_update(fileset_count) key = calc_key(fileset) megakey = "" log_text = f"State {source_status}." 
@@ -1003,9 +1005,12 @@ def scan_process( filesets_check_for_full = set() for rom in fileset["rom"]: - scan_update_files(rom, filesets_check_for_full, transaction_id, conn) + pre_update_files(rom, filesets_check_for_full, transaction_id, conn) + fileset_count += 1 + fileset_count = 0 for fileset_id, fileset in id_to_fileset_mapping.items(): + console_log_matching(fileset_count) candidate_filesets = scan_filter_candidate_filesets( fileset_id, fileset, transaction_id, conn ) @@ -1047,6 +1052,7 @@ def scan_process( conn, skiplog, ) + fileset_count += 1 # Final log with conn.cursor() as cursor: @@ -1063,7 +1069,7 @@ def scan_process( create_log(escape_string(category_text), user, escape_string(log_text), conn) -def scan_update_files(rom, filesets_check_for_full, transaction_id, conn): +def pre_update_files(rom, filesets_check_for_full, transaction_id, conn): """ Updates all the checksums for the files matching by a checksum and size. """ @@ -1074,6 +1080,9 @@ def scan_update_files(rom, filesets_check_for_full, transaction_id, conn): checksums[key] = rom[key] files_to_update = set() + size = rom["size"] if "size" in rom else 0 + size_r = rom["size-r"] if "size-r" in rom else 0 + size_rd = rom["size-rd"] if "size-rd" in rom else 0 for _, checksum in checksums.items(): query = """ @@ -1088,9 +1097,7 @@ def scan_update_files(rom, filesets_check_for_full, transaction_id, conn): AND f.`size-rd` = %s AND t.transaction != %s """ - size = rom["size"] if "size" in rom else 0 - size_r = rom["size-r"] if "size-r" in rom else 0 - size_rd = rom["size-rd"] if "size-rd" in rom else 0 + cursor.execute(query, (checksum, size, size_r, size_rd, transaction_id)) result = cursor.fetchall() if result: @@ -1104,12 +1111,20 @@ def scan_update_files(rom, filesets_check_for_full, transaction_id, conn): WHERE file = %s """ cursor.execute(query, (file_id,)) + # Update checksums for check, checksum in checksums.items(): checksize, checktype, checksum = get_checksum_props(check, checksum) query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)" cursor.execute(query, (file_id, checksize, checktype, checksum)) - - conn.commit() + # Update sizes + query = """ + UPDATE file + SET size = %s, + `size-r` = %s, + `size-rd` = %s, + WHERE id = %s + """ + cursor.execute(query, size, size_r, size_rd, file_id) def scan_perform_match( @@ -1907,31 +1922,7 @@ def set_perform_match( ) elif len(candidate_filesets) > 1: - found_match = False - for candidate_fileset in candidate_filesets: - (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn) - if is_match: - update_fileset_status(cursor, candidate_fileset, "partial") - set_populate_file(fileset, candidate_fileset, conn, detection) - auto_merged_filesets += 1 - if not skiplog: - log_matched_fileset( - src, - fileset_id, - candidate_fileset, - "partial", - user, - conn, - ) - delete_original_fileset(fileset_id, conn) - remove_manual_merge_if_size_mismatch( - candidate_fileset, manual_merge_map, set_to_candidate_dict, conn - ) - found_match = True - break - - if not found_match: - manual_merge_map[fileset_id] = candidate_filesets + manual_merge_map[fileset_id] = candidate_filesets return ( fully_matched_filesets, @@ -2160,8 +2151,7 @@ def set_filter_candidate_filesets( JOIN game g ON g.id = fs.game JOIN engine e ON e.id = g.engine JOIN transactions t ON t.fileset = fs.id - WHERE fs.id != %s - AND e.engineid = %s + WHERE e.engineid = %s AND f.detection = 1 AND t.transaction != %s ), @@ -2199,9 +2189,7 @@ def 
set_filter_candidate_filesets( JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count """ - cursor.execute( - query, (fileset_id, fileset["sourcefile"], transaction_id, fileset_id) - ) + cursor.execute(query, (fileset["sourcefile"], transaction_id, fileset_id)) rows = cursor.fetchall() candidates = [] @@ -2209,9 +2197,76 @@ def set_filter_candidate_filesets( for row in rows: candidates.append(row["fileset_id"]) + matched_candidates = [] + + candidates = [ + candidate + for candidate in candidates + if is_candidate_by_checksize(candidate, fileset, conn) + ] + + for candidate in candidates: + if is_full_detection_checksum_match(candidate, fileset, conn): + matched_candidates.append(candidate) + + if len(matched_candidates) != 0: + candidates = matched_candidates + return (candidates, fileset_count) +def is_candidate_by_checksize(candidate, fileset, conn): + with conn.cursor() as cursor: + cursor.execute( + "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE detection=1 AND fileset = %s", + (candidate,), + ) + target_files = cursor.fetchall() + candidate_files = { + target_file["id"]: [target_file["name"], target_file["size"]] + for target_file in target_files + } + + # set of (checksum, filename) + scan_checksums = set() + for file in fileset["rom"]: + for key in file: + if key.startswith("md5"): + name = os.path.basename(normalised_path(file["name"])) + scan_checksums.add((file[key], name.lower())) + + for detection_file_id, [ + detection_file_name, + detection_file_size, + ] in candidate_files.items(): + query = """ + SELECT fc.checksum, fc.checksize, fc.checktype + FROM filechecksum fc + WHERE fc.file = %s + """ + cursor.execute(query, (detection_file_id,)) + checksums_info = cursor.fetchall() + if checksums_info: + for checksum_info in checksums_info: + checksum = checksum_info["checksum"] + checksize = checksum_info["checksize"] + if checksize == "1M": + checksize = 1048576 + if ( + ( + checksum, + os.path.basename(detection_file_name.lower()), + ) + not in scan_checksums + and detection_file_size <= int(checksize) + and detection_file_size != -1 + ): + continue + else: + return True + return False + + def process_fileset( fileset, resources, @@ -2972,6 +3027,11 @@ def console_log_candidate_filtering(fileset_count): sys.stdout.flush() +def console_log_file_update(fileset_count): + sys.stdout.write(f"Updating files - Fileset {fileset_count}\r") + sys.stdout.flush() + + def console_log_matching(fileset_count): sys.stdout.write(f"Performing Match - Fileset {fileset_count}\r") sys.stdout.flush() From ca9d4a7a2a9330373e585d2948ed63405128964d Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 17 Jul 2025 17:29:48 +0530 Subject: [PATCH 25/30] INTEGRITY: Remove 'obsolete' fileset status entirely. 
--- db_functions.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/db_functions.py b/db_functions.py index 3a4fc12..c4e65a2 100644 --- a/db_functions.py +++ b/db_functions.py @@ -146,9 +146,6 @@ def insert_fileset( cursor.execute( f"UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = {existing_entry}" ) - cursor.execute( - f"UPDATE fileset SET status = 'detection' WHERE id = {existing_entry} AND status = 'obsolete'" - ) cursor.execute(f"SELECT status FROM fileset WHERE id = {existing_entry}") status = cursor.fetchone()["status"] if status == "user": @@ -610,10 +607,6 @@ def db_insert(data_arr, username=None, skiplog=False): fileset_count += 1 - if detection: - conn.cursor().execute( - "UPDATE fileset SET status = 'obsolete' WHERE `timestamp` != FROM_UNIXTIME(@fileset_time_last) AND status = 'detection'" - ) cur = conn.cursor() try: From a30364586988c4df01427fa16b2942b57be02981 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 17 Jul 2025 17:31:57 +0530 Subject: [PATCH 26/30] INTEGRITY: Add checksum based filtering before filtering by maximum number of files matched. --- db_functions.py | 155 +++++++++++++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 62 deletions(-) diff --git a/db_functions.py b/db_functions.py index c4e65a2..20a3240 100644 --- a/db_functions.py +++ b/db_functions.py @@ -2130,15 +2130,17 @@ def set_filter_candidate_filesets( fileset_id, fileset, fileset_count, transaction_id, conn ): """ - Returns a list of candidate filesets that can be merged + Returns a list of candidate filesets that can be merged. + Performs early filtering in SQL (by engine, name, size) and then + applies checksum filtering and max-match filtering in Python. """ with conn.cursor() as cursor: - # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches fileset_count += 1 console_log_candidate_filtering(fileset_count) + + # Early filter candidates using enginename, filename and size query = """ - WITH candidate_fileset AS ( - SELECT fs.id AS fileset_id, f.name, f.size + SELECT fs.id AS fileset_id, f.id AS file_id, f.name, f.size FROM file f JOIN fileset fs ON f.fileset = fs.id JOIN game g ON g.id = fs.game @@ -2147,65 +2149,94 @@ def set_filter_candidate_filesets( WHERE e.engineid = %s AND f.detection = 1 AND t.transaction != %s - ), - total_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS detection_files_found - FROM candidate_fileset cf - GROUP BY fileset_id - ), - set_fileset AS ( - SELECT name, size FROM file - WHERE fileset = %s - ), - matched_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS match_files_count - FROM candidate_fileset cf - JOIN set_fileset sf ON ( ( - cf.name = sf.name - OR - REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '') - ) AND (cf.size = sf.size OR cf.size = -1) ) - GROUP BY cf.fileset_id - ), - valid_matched_detection_files AS ( - SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count - FROM matched_detection_files mdf - JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id - WHERE tdf.detection_files_found <= mdf.match_files_count - ), - max_match_count AS ( - SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files - ) - SELECT vmdf.fileset_id - FROM valid_matched_detection_files vmdf - JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id - JOIN max_match_count mmc ON 
vmdf.valid_match_files_count = mmc.max_count """ - - cursor.execute(query, (fileset["sourcefile"], transaction_id, fileset_id)) - rows = cursor.fetchall() - - candidates = [] - if rows: - for row in rows: - candidates.append(row["fileset_id"]) - - matched_candidates = [] - - candidates = [ - candidate - for candidate in candidates - if is_candidate_by_checksize(candidate, fileset, conn) - ] - - for candidate in candidates: - if is_full_detection_checksum_match(candidate, fileset, conn): - matched_candidates.append(candidate) - - if len(matched_candidates) != 0: - candidates = matched_candidates - - return (candidates, fileset_count) + cursor.execute(query, (fileset["sourcefile"], transaction_id)) + raw_candidates = cursor.fetchall() + + # fileset id to detection files map + candidate_map = defaultdict(list) + total_detection_files_map = defaultdict(int) + for row in raw_candidates: + candidate_map[row["fileset_id"]].append( + { + "file_id": row["file_id"], + "name": row["name"], + "size": row["size"], + } + ) + for id, files in candidate_map.items(): + total_detection_files_map[id] = len(files) + + set_checksums = set() + set_file_name_size = set() + for file in fileset["rom"]: + for key in file: + if key.startswith("md5"): + name = os.path.basename(normalised_path(file["name"])) + set_checksums.add((file[key], name.lower(), int(file["size"]))) + set_checksums.add((file[key], name.lower(), -1)) + set_file_name_size.add((name.lower(), -1)) + set_file_name_size.add((name.lower(), int(file["size"]))) + + # Filter candidates by detection filename and file size (including -1) and increase matched file count + # if filesize = -1, + # elif filesize <= checksize and checksum matches, + # elif filesize > checksize. + match_counts = {} + for fileset_id, files in candidate_map.items(): + count = 0 + with conn.cursor() as cursor: + for f in files: + filename = os.path.basename(f["name"]).lower() + filesize = f["size"] + if (filename, filesize) in set_file_name_size: + if filesize == -1: + count += 1 + else: + cursor.execute( + """ + SELECT checksum, checksize, checktype + FROM filechecksum + WHERE file = %s + """, + (f["file_id"],), + ) + checksums = cursor.fetchall() + not_inc_count = False + for c in checksums: + checksum = c["checksum"] + checksize = c["checksize"] + if checksize == "1M": + checksize = 1048576 + elif checksize == "0": + checksize = filesize + if filesize <= int(checksize): + if (checksum, filename, filesize) in set_checksums: + count += 1 + not_inc_count = True + # if it was a true match, checksum should be present + break + if not not_inc_count: + count += 1 + if count > 0 and total_detection_files_map[fileset_id] <= count: + match_counts[fileset_id] = count + + # Filter only entries with maximum number of matched files + if not match_counts: + return ([], fileset_count) + + max_match = max(match_counts.values()) + candidates = [fid for fid, count in match_counts.items() if count == max_match] + + matched_candidates = [] + for candidate in candidates: + if is_full_detection_checksum_match(candidate, fileset, conn): + matched_candidates.append(candidate) + + if len(matched_candidates) != 0: + candidates = matched_candidates + + return (candidates, fileset_count) def is_candidate_by_checksize(candidate, fileset, conn): From 4dd7e297c842b9b399ca6b44bbdc56738877f003 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 17 Jul 2025 19:47:06 +0530 Subject: [PATCH 27/30] INTEGRITY: Merge one of the entries from dropped duplicate entries. Drop others. 
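In short, when several set.dat entries all map to the same single candidate, only the first one is merged and the remaining ones are logged and dropped. A minimal sketch of that grouping, with illustrative names (the real loop below also writes log entries and deletes the dropped filesets):

    from collections import defaultdict

    def split_duplicate_filesets(set_to_candidate):
        # set_to_candidate: {set_fileset_id: [candidate_fileset_ids]}
        by_candidate = defaultdict(list)
        for set_id, candidates in set_to_candidate.items():
            if len(candidates) == 1:
                by_candidate[candidates[0]].append(set_id)

        keep, drop = {}, []
        for candidate, set_ids in by_candidate.items():
            if len(set_ids) > 1:
                keep[set_ids[0]] = candidate   # first entry still gets merged
                drop.extend(set_ids[1:])       # the rest are extra variants, dropped
        return keep, drop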
--- db_functions.py | 50 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/db_functions.py b/db_functions.py index 20a3240..f9e9fe0 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1693,7 +1693,7 @@ def set_process( console_message = "Looking for duplicates..." console_log(console_message) - # Remove all such filesets, which have many to one mapping with a single candidate, those are extra variants. + # Remove all such filesets, which have many to one mapping with a single candidate, just merge one of them. value_to_keys = defaultdict(list) for set_fileset, candidates in set_to_candidate_dict.items(): if len(candidates) == 1: @@ -1717,7 +1717,12 @@ def set_process( platform = result["platform"] language = result["language"] + # Skip the first entry, let it merge and drop others + skip = True for set_fileset in set_filesets: + if skip: + skip = False + continue fileset = id_to_fileset_dict[set_fileset] category_text = "Drop fileset - Duplicates" fileset_name = fileset["name"] if "name" in fileset else "" @@ -1742,9 +1747,9 @@ def set_process( fileset = id_to_fileset_dict[fileset_id] # Filter by platform to reduce manual merge - candidate_filesets = set_filter_by_platform( - fileset["name"], candidate_filesets, conn - ) + # candidate_filesets = set_filter_by_platform( + # fileset["name"], candidate_filesets, conn + # ) ( fully_matched_filesets, @@ -1771,16 +1776,35 @@ def set_process( match_count += 1 console_log("Matching performed.") - for fileset_id, candidates in manual_merge_map.items(): - category_text = "Manual Merge Required" - log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidates)}." - manual_merged_filesets += 1 - add_manual_merge( - candidates, fileset_id, category_text, log_text, user, conn, log_text - ) - - # Final log with conn.cursor() as cursor: + for fileset_id, candidates in manual_merge_map.items(): + if len(candidates) == 0: + category_text = "Drop fileset - No Candidates" + fileset = id_to_fileset_dict[fileset_id] + fileset_name = fileset["name"] if "name" in fileset else "" + fileset_description = ( + fileset["description"] if "description" in fileset else "" + ) + log_text = f"Drop fileset as no matching candidates. Name: {fileset_name}, Description: {fileset_description}." + create_log( + escape_string(category_text), user, escape_string(log_text), conn + ) + dropped_early_no_candidate += 1 + delete_original_fileset(fileset_id, conn) + else: + category_text = "Manual Merge Required" + log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidates)}." + manual_merged_filesets += 1 + add_manual_merge( + candidates, + fileset_id, + category_text, + log_text, + user, + conn, + log_text, + ) + cursor.execute( "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", (transaction_id,), From 57df340d9f267dc2e88649d4c377fede6f589e87 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 17 Jul 2025 20:42:51 +0530 Subject: [PATCH 28/30] INTEGRITY: Merge filtering logic for glk with existing set.dat filtering. 
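For glk engines the shared filter keeps the same SQL shape but matches candidates by gameid instead of file name, and the per-file counting falls back to sizes. A rough sketch of just the counting branch, with illustrative names and without the checksum refinement the real code performs:

    def count_matching_detection_files(files, set_file_name_size, set_glk_file_size, is_glk):
        # files: detection files of one candidate as (basename, size) tuples
        count = 0
        for name, size in files:
            if is_glk and (size in set_glk_file_size or size == 0):
                count += 1
                continue
            if (name, size) in set_file_name_size:
                count += 1
        return count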
--- db_functions.py | 119 ++++++++++++------------------------------------ 1 file changed, 28 insertions(+), 91 deletions(-) diff --git a/db_functions.py b/db_functions.py index f9e9fe0..897d346 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1659,14 +1659,9 @@ def set_process( # Separating out the matching logic for glk engine engine_name = fileset["sourcefile"].split("-")[0] - if engine_name == "glk": - (candidate_filesets, fileset_count) = set_glk_filter_candidate_filesets( - fileset_id, fileset, fileset_count, transaction_id, engine_name, conn - ) - else: - (candidate_filesets, fileset_count) = set_filter_candidate_filesets( - fileset_id, fileset, fileset_count, transaction_id, conn - ) + (candidate_filesets, fileset_count) = set_filter_candidate_filesets( + fileset_id, fileset, fileset_count, transaction_id, engine_name, conn + ) # Mac files in set.dat are not represented properly and they won't find a candidate fileset for a match, so we can drop them. if len(candidate_filesets) == 0: @@ -2071,93 +2066,16 @@ def is_full_checksum_match(candidate_fileset, fileset, conn): return (len(unmatched_files) == 0, unmatched_files) -def set_glk_filter_candidate_filesets( - fileset_id, fileset, fileset_count, transaction_id, engine_name, conn -): - """ - Returns a list of candidate filesets for glk engines that can be merged - """ - with conn.cursor() as cursor: - # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches - fileset_count += 1 - console_log_candidate_filtering(fileset_count) - query = """ - WITH candidate_fileset AS ( - SELECT fs.id AS fileset_id, f.size - FROM file f - JOIN fileset fs ON f.fileset = fs.id - JOIN game g ON g.id = fs.game - JOIN engine e ON e.id = g.engine - JOIN transactions t ON t.fileset = fs.id - WHERE fs.id != %s - AND e.engineid = %s - AND f.detection = 1 - AND t.transaction != %s - AND (g.gameid = %s OR (g.gameid != %s AND g.gameid LIKE %s)) - ), - total_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS detection_files_found - FROM candidate_fileset cf - GROUP BY fileset_id - ), - set_fileset AS ( - SELECT size FROM file - WHERE fileset = %s - ), - matched_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS match_files_count - FROM candidate_fileset cf - JOIN set_fileset sf ON - cf.size = sf.size OR cf.size = 0 - GROUP BY cf.fileset_id - ), - valid_matched_detection_files AS ( - SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count - FROM matched_detection_files mdf - JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id - WHERE tdf.detection_files_found <= mdf.match_files_count - ), - max_match_count AS ( - SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files - ) - SELECT vmdf.fileset_id - FROM valid_matched_detection_files vmdf - JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id - JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count - """ - - gameid_pattern = f"%{fileset['name']}%" - - cursor.execute( - query, - ( - fileset_id, - engine_name, - transaction_id, - fileset["name"], - fileset["name"], - gameid_pattern, - fileset_id, - ), - ) - rows = cursor.fetchall() - - candidates = [] - if rows: - for row in rows: - candidates.append(row["fileset_id"]) - - return (candidates, fileset_count) - - def set_filter_candidate_filesets( - fileset_id, fileset, fileset_count, transaction_id, conn + fileset_id, fileset, fileset_count, 
transaction_id, engine_name, conn ): """ Returns a list of candidate filesets that can be merged. Performs early filtering in SQL (by engine, name, size) and then applies checksum filtering and max-match filtering in Python. + In case of glk engines, filtering is not by name, rather gameid is used. """ + is_glk = engine_name == "glk" with conn.cursor() as cursor: fileset_count += 1 console_log_candidate_filtering(fileset_count) @@ -2174,7 +2092,21 @@ def set_filter_candidate_filesets( AND f.detection = 1 AND t.transaction != %s """ - cursor.execute(query, (fileset["sourcefile"], transaction_id)) + if is_glk: + query += " AND (g.gameid = %s OR (g.gameid != %s AND g.gameid LIKE %s))" + gameid_pattern = f"%{fileset['name']}%" + cursor.execute( + query, + ( + engine_name, + transaction_id, + fileset["name"], + fileset["name"], + gameid_pattern, + ), + ) + else: + cursor.execute(query, (fileset["sourcefile"], transaction_id)) raw_candidates = cursor.fetchall() # fileset id to detection files map @@ -2184,7 +2116,7 @@ def set_filter_candidate_filesets( candidate_map[row["fileset_id"]].append( { "file_id": row["file_id"], - "name": row["name"], + "name": os.path.basename(normalised_path(row["name"])).lower(), "size": row["size"], } ) @@ -2193,14 +2125,17 @@ def set_filter_candidate_filesets( set_checksums = set() set_file_name_size = set() + set_glk_file_size = set() for file in fileset["rom"]: + name = os.path.basename(normalised_path(file["name"])) for key in file: if key.startswith("md5"): - name = os.path.basename(normalised_path(file["name"])) set_checksums.add((file[key], name.lower(), int(file["size"]))) set_checksums.add((file[key], name.lower(), -1)) set_file_name_size.add((name.lower(), -1)) set_file_name_size.add((name.lower(), int(file["size"]))) + if is_glk: + set_glk_file_size.add(int(file["size"])) # Filter candidates by detection filename and file size (including -1) and increase matched file count # if filesize = -1, @@ -2213,6 +2148,8 @@ def set_filter_candidate_filesets( for f in files: filename = os.path.basename(f["name"]).lower() filesize = f["size"] + if is_glk and (filesize in set_glk_file_size or filesize == 0): + count += 1 if (filename, filesize) in set_file_name_size: if filesize == -1: count += 1 From c9dca049ba2f3ef18820ce778bf0610fa1b35c02 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Fri, 18 Jul 2025 00:40:57 +0530 Subject: [PATCH 29/30] INTEGRITY: Add checksum filtering before max files filtering in scan.dat processing --- db_functions.py | 183 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 130 insertions(+), 53 deletions(-) diff --git a/db_functions.py b/db_functions.py index 897d346..2ae4778 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1059,6 +1059,7 @@ def scan_process( create_log(escape_string(category_text), user, escape_string(log_text), conn) category_text = "Upload information" log_text = f"Number of filesets: {fileset_insertion_count}. Filesets automatically merged: {automatic_merged_filesets}. Filesets requiring manual merge (multiple candidates): {manual_merged_filesets}. Filesets requiring manual merge (matched with detection): {manual_merged_with_detection}. Filesets dropped, no candidate: {dropped_early_no_candidate}. Filesets matched with existing Full fileset: {match_with_full_fileset}. Filesets with mismatched files with Full fileset: {mismatch_with_full_fileset}. Filesets missing files compared to partial fileset candidate: {filesets_with_missing_files}." 
+ console_log(log_text) create_log(escape_string(category_text), user, escape_string(log_text), conn) @@ -1115,9 +1116,12 @@ def pre_update_files(rom, filesets_check_for_full, transaction_id, conn): SET size = %s, `size-r` = %s, `size-rd` = %s, + name = %s WHERE id = %s """ - cursor.execute(query, size, size_r, size_rd, file_id) + cursor.execute( + query, (size, size_r, size_rd, normalised_path(rom["name"]), file_id) + ) def scan_perform_match( @@ -1396,73 +1400,146 @@ def total_fileset_files(fileset): def scan_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): """ - Returns a list of candidate filesets that can be merged + Returns a list of candidate filesets that can be merged. + Performs early filtering in SQL (by name, size) and then + applies checksum filtering and max-match filtering in Python. """ with conn.cursor() as cursor: - # Returns those filesets which have all detection files matching in the scan fileset filtered by file name and file size(if not -1). - + # Fetching detection filename and all sizes (size, size-r, size-rd) from database query = """ - WITH candidate_fileset AS ( - SELECT fs.id AS fileset_id, f.name, f.size, + SELECT fs.id AS fileset_id, f.id as file_id, f.name, f.size, f.`size-r` AS size_r, f.`size-rd` AS size_rd FROM file f JOIN fileset fs ON f.fileset = fs.id JOIN game g ON g.id = fs.game + JOIN engine e ON e.id = g.engine JOIN transactions t ON t.fileset = fs.id WHERE f.detection = 1 AND t.transaction != %s - ), - total_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS detection_files_found - FROM candidate_fileset cf - GROUP BY fileset_id - ), - set_fileset AS ( - SELECT name, size, - `size-r` AS size_r, `size-rd` AS size_rd - FROM file - WHERE fileset = %s - ), - matched_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS match_files_count - FROM candidate_fileset cf - JOIN set_fileset sf ON ( ( - cf.name = sf.name - OR - REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '') - ) AND (cf.size = sf.size OR cf.size = -1) - AND (cf.size_r = sf.size_r) - AND (cf.size_rd = sf.size_rd)) - GROUP BY cf.fileset_id - ), - valid_matched_detection_files AS ( - SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count - FROM matched_detection_files mdf - JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id - WHERE tdf.detection_files_found <= mdf.match_files_count - ), - max_match_count AS ( - SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files - ) - SELECT vmdf.fileset_id - FROM valid_matched_detection_files vmdf - JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id - JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count """ + cursor.execute(query, (transaction_id,)) + raw_candidates = cursor.fetchall() + + # fileset id to detection files map + candidate_map = defaultdict(list) + total_detection_files_map = defaultdict(int) + for row in raw_candidates: + candidate_map[row["fileset_id"]].append( + { + "file_id": row["file_id"], + "name": os.path.basename(normalised_path(row["name"])).lower(), + "size": row["size"], + "size-r": row["size_r"], + "size-rd": row["size_rd"], + } + ) + for id, files in candidate_map.items(): + total_detection_files_map[id] = len(files) + + set_checksums = set() + set_file_name_size = set() + for file in fileset["rom"]: + name = os.path.basename(normalised_path(file["name"])) + for key in file: + if key.startswith("md5"): + set_checksums.add( + ( + file[key], + name.lower(), + 
int(file["size"]), + int(file["size-r"]), + int(file["size-rd"]), + ) + ) + set_checksums.add( + ( + file[key], + name.lower(), + -1, + int(file["size-r"]), + int(file["size-rd"]), + ) + ) + set_file_name_size.add( + (name.lower(), -1, int(file["size-r"]), int(file["size-rd"])) + ) + set_file_name_size.add( + (name.lower(), int(file["size"]), int(file["size-r"]), int(file["size-rd"])) + ) + + # Filter candidates by detection filename and file size (including -1) and increase matched file count + # if filesize = -1, + # elif filesize <= checksize and checksum matches, + # elif filesize > checksize. + match_counts = {} + for fileset_id, files in candidate_map.items(): + count = 0 + with conn.cursor() as cursor: + for f in files: + filename = os.path.basename(f["name"]).lower() + size = f["size"] + size_r = f["size-r"] + size_rd = f["size-rd"] + if (filename, size, size_r, size_rd) in set_file_name_size: + if size == -1: + count += 1 + else: + cursor.execute( + """ + SELECT checksum, checksize, checktype + FROM filechecksum + WHERE file = %s + """, + (f["file_id"],), + ) + checksums = cursor.fetchall() + not_inc_count = False + for c in checksums: + filesize = size + checksum = c["checksum"] + checksize = c["checksize"] + checktype = c["checktype"] + # Macfiles handling + if checktype in ["md5-r", "md5-rt"]: + filesize = size_rd - cursor.execute(query, (transaction_id, fileset_id)) - rows = cursor.fetchall() + if checksize == "1M": + checksize = 1048576 + elif checksize == "0": + checksize = filesize + if filesize <= int(checksize): + if ( + checksum, + filename, + size, + size_r, + size_rd, + ) in set_checksums: + count += 1 + not_inc_count = True + # if it was a true match, checksum should be present + break + if not not_inc_count: + count += 1 + if count > 0 and total_detection_files_map[fileset_id] <= count: + match_counts[fileset_id] = count + + # Filter only entries with maximum number of matched files + if not match_counts: + return [] - candidates = [] - if rows: - for row in rows: - candidates.append(row["fileset_id"]) + max_match = max(match_counts.values()) + candidates = [fid for fid, count in match_counts.items() if count == max_match] - for candidate in candidates: - if not is_full_detection_checksum_match(candidate, fileset, conn): - candidates.remove(candidate) + matched_candidates = [] + for candidate in candidates: + if is_full_detection_checksum_match(candidate, fileset, conn): + matched_candidates.append(candidate) + + if len(matched_candidates) != 0: + candidates = matched_candidates - return candidates + return candidates def get_unmatched_files(candidate_fileset, fileset, conn): From 5e54aa690e6c6c6672902b63277a12eed9054b8b Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Fri, 18 Jul 2025 02:28:22 +0530 Subject: [PATCH 30/30] INTEGRITY: Parameterising all sql queries in db_functions.py --- db_functions.py | 187 +++++++++++++++++++++++++++--------------------- 1 file changed, 104 insertions(+), 83 deletions(-) diff --git a/db_functions.py b/db_functions.py index 2ae4778..490bceb 100644 --- a/db_functions.py +++ b/db_functions.py @@ -79,24 +79,26 @@ def insert_game(engine_name, engineid, title, gameid, extra, platform, lang, con # Set @engine_last if engine already present in table exists = False with conn.cursor() as cursor: - cursor.execute(f"SELECT id FROM engine WHERE engineid = '{engineid}'") + cursor.execute("SELECT id FROM engine WHERE engineid = %s", (engineid,)) res = cursor.fetchone() if res is not None: exists = True - cursor.execute(f"SET @engine_last = 
'{res['id']}'") + cursor.execute("SET @engine_last = %s", (res["id"],)) # Insert into table if not present if not exists: with conn.cursor() as cursor: cursor.execute( - f"INSERT INTO engine (name, engineid) VALUES ('{escape_string(engine_name)}', '{engineid}')" + "INSERT INTO engine (name, engineid) VALUES (%s, %s)", + (engine_name, engineid), ) cursor.execute("SET @engine_last = LAST_INSERT_ID()") # Insert into game with conn.cursor() as cursor: cursor.execute( - f"INSERT INTO game (name, engine, gameid, extra, platform, language) VALUES ('{escape_string(title)}', @engine_last, '{gameid}', '{escape_string(extra)}', '{platform}', '{lang}')" + "INSERT INTO game (name, engine, gameid, extra, platform, language) VALUES (%s, @engine_last, %s, %s, %s, %s)", + (title, gameid, extra, platform, lang), ) cursor.execute("SET @game_last = LAST_INSERT_ID()") @@ -129,24 +131,27 @@ def insert_fileset( # Check if key/megakey already exists, if so, skip insertion (no quotes on purpose) if detection: with conn.cursor() as cursor: - cursor.execute(f"SELECT id FROM fileset WHERE megakey = {megakey}") + cursor.execute("SELECT id FROM fileset WHERE megakey = %s", (megakey,)) existing_entry = cursor.fetchone() else: with conn.cursor() as cursor: - cursor.execute(f"SELECT id FROM fileset WHERE `key` = {key}") + cursor.execute("SELECT id FROM fileset WHERE `key` = %s", (key,)) existing_entry = cursor.fetchone() if existing_entry is not None: existing_entry = existing_entry["id"] with conn.cursor() as cursor: - cursor.execute(f"SET @fileset_last = {existing_entry}") - cursor.execute(f"DELETE FROM file WHERE fileset = {existing_entry}") + cursor.execute("SET @fileset_last = %s", (existing_entry,)) + cursor.execute("DELETE FROM file WHERE fileset = %s", (existing_entry,)) cursor.execute( - f"UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = {existing_entry}" + "UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = %s", + (existing_entry,), + ) + cursor.execute( + "SELECT status FROM fileset WHERE id = %s", (existing_entry,) ) - cursor.execute(f"SELECT status FROM fileset WHERE id = {existing_entry}") status = cursor.fetchone()["status"] if status == "user": add_usercount(existing_entry, conn) @@ -162,10 +167,10 @@ def insert_fileset( return (existing_entry, True) # $game and $key should not be parsed as a mysql string, hence no quotes - query = f"INSERT INTO fileset (game, status, src, `key`, megakey, `timestamp`, set_dat_metadata) VALUES ({game}, '{status}', '{src}', {key}, {megakey}, FROM_UNIXTIME(@fileset_time_last), '{escape_string(set_dat_metadata)}')" + query = "INSERT INTO fileset (game, status, src, `key`, megakey, `timestamp`, set_dat_metadata) VALUES (%s, %s, %s, %s, %s, FROM_UNIXTIME(@fileset_time_last), %s)" fileset_id = -1 with conn.cursor() as cursor: - cursor.execute(query) + cursor.execute(query, (game, status, src, key, megakey, set_dat_metadata)) fileset_id = cursor.lastrowid cursor.execute("SET @fileset_last = LAST_INSERT_ID()") @@ -188,7 +193,8 @@ def insert_fileset( update_history(0, fileset_last, conn) with conn.cursor() as cursor: cursor.execute( - f"INSERT INTO transactions (`transaction`, fileset) VALUES ({transaction}, {fileset_last})" + "INSERT INTO transactions (`transaction`, fileset) VALUES (%s, %s)", + (transaction, fileset_last), ) return (fileset_id, False) @@ -230,17 +236,11 @@ def insert_file(file, detection, src, conn): values.append(file["size"] if "size" in file else "0") values.append(file["size-r"] if "size-r" in file else 
"0") values.append(file["size-rd"] if "size-rd" in file else "0") - - modification_time = file["modification-time"] if "modification-time" in file else "" - values.append(modification_time) - + values.append(file["modification-time"] if "modification-time" in file else "") values.extend([checksum, detection, detection_type]) # Parameterised Query - placeholders = ( - ["%s"] * (len(values[:6])) + ["@fileset_last"] + ["%s"] * 2 + ["NOW()"] - ) - query = f"INSERT INTO file ( name, size, `size-r`, `size-rd`, `modification-time`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ({', '.join(placeholders)})" + query = "INSERT INTO file ( name, size, `size-r`, `size-rd`, `modification-time`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES (%s, %s, %s, %s, %s, %s, @fileset_last, %s, %s, NOW())" with conn.cursor() as cursor: cursor.execute(query, values) @@ -248,7 +248,8 @@ def insert_file(file, detection, src, conn): if detection: with conn.cursor() as cursor: cursor.execute( - f"UPDATE fileset SET detection_size = {checksize} WHERE id = @fileset_last AND detection_size IS NULL" + "UPDATE fileset SET detection_size = %s WHERE id = @fileset_last AND detection_size IS NULL", + (checksize,), ) with conn.cursor() as cursor: cursor.execute("SET @file_last = LAST_INSERT_ID()") @@ -279,7 +280,7 @@ def add_all_equal_checksums(checksize, checktype, checksum, file_id, conn): if checktype[-1] == "r": size_name += "-rd" - cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = {file_id}") + cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = %s", (file_id,)) result = cursor.fetchone() if not result: return @@ -375,9 +376,10 @@ def punycode_need_encode(orig): def create_log(category, user, text, conn): query = f"INSERT INTO log (`timestamp`, category, user, `text`) VALUES (FROM_UNIXTIME({int(time.time())}), '{escape_string(category)}', '{escape_string(user)}', '{escape_string(text)}')" + query = "INSERT INTO log (`timestamp`, category, user, `text`) VALUES (FROM_UNIXTIME(%s), %s, %s, %s)" with conn.cursor() as cursor: try: - cursor.execute(query) + cursor.execute(query, (int(time.time()), category, user, text)) conn.commit() except Exception as e: conn.rollback() @@ -390,10 +392,12 @@ def create_log(category, user, text, conn): def update_history(source_id, target_id, conn, log_last=None): - query = f"INSERT INTO history (`timestamp`, fileset, oldfileset, log) VALUES (NOW(), {target_id}, {source_id}, {log_last if log_last is not None else 0})" + query = "INSERT INTO history (`timestamp`, fileset, oldfileset, log) VALUES (NOW(), %s, %s, %s)" with conn.cursor() as cursor: try: - cursor.execute(query) + cursor.execute( + query, (target_id, source_id, log_last if log_last is not None else 0) + ) conn.commit() except Exception as e: conn.rollback() @@ -418,7 +422,8 @@ def get_all_related_filesets(fileset_id, conn, visited=None): try: with conn.cursor() as cursor: cursor.execute( - f"SELECT fileset, oldfileset FROM history WHERE fileset = {fileset_id} OR oldfileset = {fileset_id}" + "SELECT fileset, oldfileset FROM history WHERE fileset = %s OR oldfileset = %s", + (fileset_id, fileset_id), ) history_records = cursor.fetchall() @@ -516,7 +521,7 @@ def db_insert(data_arr, username=None, skiplog=False): detection = src == "scummvm" status = "detection" if detection else src - conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}") + conn.cursor().execute("SET @fileset_time_last = %s", (int(time.time()),)) with conn.cursor() as cursor: cursor.execute("SELECT 
MAX(`transaction`) FROM transactions") @@ -611,7 +616,8 @@ def db_insert(data_arr, username=None, skiplog=False): try: cur.execute( - f"SELECT COUNT(fileset) from transactions WHERE `transaction` = {transaction_id}" + "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", + (transaction_id,), ) fileset_insertion_count = cur.fetchone()["COUNT(fileset)"] category_text = f"Uploaded from {src}" @@ -627,11 +633,13 @@ def db_insert(data_arr, username=None, skiplog=False): def compare_filesets(id1, id2, conn): with conn.cursor() as cursor: cursor.execute( - f"SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = '{id1}'" + "SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = %s", + (id1,), ) fileset1 = cursor.fetchall() cursor.execute( - f"SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = '{id2}'" + "SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = %s", + (id2,), ) fileset2 = cursor.fetchall() @@ -665,9 +673,9 @@ def find_matching_game(game_files): for file in game_files: checksum = file[1] - query = f"SELECT file.fileset as file_fileset FROM filechecksum JOIN file ON filechecksum.file = file.id WHERE filechecksum.checksum = '{checksum}' AND file.detection = TRUE" + query = "SELECT file.fileset as file_fileset FROM filechecksum JOIN file ON filechecksum.file = file.id WHERE filechecksum.checksum = %s AND file.detection = TRUE" with conn.cursor() as cursor: - cursor.execute(query) + cursor.execute(query, (checksum,)) records = cursor.fetchall() # If file is not part of detection entries, skip it @@ -682,7 +690,8 @@ def find_matching_game(game_files): for key, value in Counter(matching_filesets).items(): with conn.cursor() as cursor: cursor.execute( - f"SELECT COUNT(file.id) FROM file JOIN fileset ON file.fileset = fileset.id WHERE fileset.id = '{key}'" + "SELECT COUNT(file.id) FROM file JOIN fileset ON file.fileset = fileset.id WHERE fileset.id = %s", + (key,), ) count_files_in_fileset = cursor.fetchone()["COUNT(file.id)"] @@ -693,7 +702,8 @@ def find_matching_game(game_files): with conn.cursor() as cursor: cursor.execute( - f"SELECT engineid, game.id, gameid, platform, language, `key`, src, fileset.id as fileset FROM game JOIN fileset ON fileset.game = game.id JOIN engine ON engine.id = game.engine WHERE fileset.id = '{key}'" + "SELECT engineid, game.id, gameid, platform, language, `key`, src, fileset.id as fileset FROM game JOIN fileset ON fileset.game = game.id JOIN engine ON engine.id = game.engine WHERE fileset.id = %s", + (key,), ) records = cursor.fetchall() @@ -717,7 +727,7 @@ def find_matching_game(game_files): if compare_filesets(matching_games[0]["fileset"], game_files[0][0], conn): with conn.cursor() as cursor: cursor.execute( - f"UPDATE fileset SET `delete` = TRUE WHERE id = {game_files[0][0]}" + "UPDATE fileset SET `delete` = TRUE WHERE id = %s", (game_files[0][0],) ) return [] @@ -730,7 +740,8 @@ def merge_filesets(detection_id, dat_id): try: with conn.cursor() as cursor: cursor.execute( - f"SELECT DISTINCT(filechecksum.checksum), checksize, checktype FROM filechecksum JOIN file on file.id = filechecksum.file WHERE fileset = '{detection_id}'" + "SELECT DISTINCT(filechecksum.checksum), checksize, checktype FROM filechecksum JOIN file on file.id = filechecksum.file WHERE fileset = %s'", + (detection_id,), ) detection_files = cursor.fetchall() @@ -740,22 +751,26 @@ def merge_filesets(detection_id, dat_id): checktype = file[2] cursor.execute( - f"DELETE FROM file WHERE checksum = 
'{checksum}' AND fileset = {detection_id} LIMIT 1" + "DELETE FROM file WHERE checksum = %s AND fileset = %s LIMIT 1", + (checksum, detection_id), ) cursor.execute( - f"UPDATE file JOIN filechecksum ON filechecksum.file = file.id SET detection = TRUE, checksize = {checksize}, checktype = '{checktype}' WHERE fileset = '{dat_id}' AND filechecksum.checksum = '{checksum}'" + "UPDATE file JOIN filechecksum ON filechecksum.file = file.id SET detection = TRUE, checksize = %s, checktype = %s WHERE fileset = %s AND filechecksum.checksum = %s", + (checksize, checktype, dat_id, checksum), ) cursor.execute( - f"INSERT INTO history (`timestamp`, fileset, oldfileset) VALUES (FROM_UNIXTIME({int(time.time())}), {dat_id}, {detection_id})" + "INSERT INTO history (`timestamp`, fileset, oldfileset) VALUES (FROM_UNIXTIME(%s), %s, %s)", + (int(time.time()), dat_id, detection_id), ) cursor.execute("SELECT LAST_INSERT_ID()") history_last = cursor.fetchone()["LAST_INSERT_ID()"] cursor.execute( - f"UPDATE history SET fileset = {dat_id} WHERE fileset = {detection_id}" + "UPDATE history SET fileset = %s WHERE fileset = %s", + (dat_id, detection_id), ) - cursor.execute(f"DELETE FROM fileset WHERE id = {detection_id}") + cursor.execute("DELETE FROM fileset WHERE id = %s", (detection_id,)) conn.commit() except Exception as e: @@ -812,11 +827,13 @@ def populate_matching_games(): log_text = f"Matched game {matched_game['engineid']}:\n{matched_game['gameid']}-{matched_game['platform']}-{matched_game['language']}\nvariant {matched_game['key']}. State {status}. Fileset:{fileset[0][0]}." # Updating the fileset.game value to be $matched_game["id"] - query = f"UPDATE fileset SET game = {matched_game['id']}, status = '{status}', `key` = '{matched_game['key']}' WHERE id = {fileset[0][0]}" + query = "UPDATE fileset SET game = %s, status = %s, `key` = %s WHERE id = %s" history_last = merge_filesets(matched_game["fileset"], fileset[0][0]) - if cursor.execute(query): + if cursor.execute( + query, (matched_game["id"], status, matched_game["key"], fileset[0][0]) + ): user = f"cli:{getpass.getuser()}" create_log( @@ -835,7 +852,7 @@ def populate_matching_games(): # Add log id to the history table cursor.execute( - f"UPDATE history SET log = {log_last} WHERE id = {history_last}" + "UPDATE history SET log = %s WHERE id = %s", (log_last, history_last) ) try: @@ -873,7 +890,7 @@ def match_fileset(data_arr, username=None, skiplog=False): detection = src == "scummvm" source_status = "detection" if detection else src - conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}") + conn.cursor().execute("SET @fileset_time_last = %s", (int(time.time()),)) with conn.cursor() as cursor: cursor.execute("SELECT MAX(`transaction`) FROM transactions") @@ -1280,7 +1297,8 @@ def update_all_files(fileset, candidate_fileset_id, is_candidate_detection, conn with conn.cursor() as cursor: # Extracting the filename from the filepath. 
cursor.execute( - f"SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = {candidate_fileset_id}" + "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = %s", + (candidate_fileset_id,), ) target_files = cursor.fetchall() candidate_files = { @@ -2417,13 +2435,13 @@ def find_matching_filesets(fileset, conn, status): checksize, checktype, checksum = get_checksum_props( checktype, checksum ) - query = f"""SELECT DISTINCT fs.id AS fileset_id + query = """SELECT DISTINCT fs.id AS fileset_id FROM fileset fs JOIN file f ON fs.id = f.fileset JOIN filechecksum fc ON f.id = fc.file - WHERE fc.checksum = '{checksum}' AND fc.checktype = '{checktype}' - AND fs.status IN ({state})""" - cursor.execute(query) + WHERE fc.checksum = %s AND fc.checktype = %s + AND fs.status IN (%s)""" + cursor.execute(query, (checksum, checktype, state)) records = cursor.fetchall() if records: for record in records: @@ -2446,16 +2464,16 @@ def matching_set(fileset, conn): checksum = checksum.split(":")[1] size = file["size"] - query = f""" + query = """ SELECT DISTINCT fs.id AS fileset_id FROM fileset fs JOIN file f ON fs.id = f.fileset JOIN filechecksum fc ON f.id = fc.file - WHERE fc.checksum = '{checksum}' AND fc.checktype LIKE 'md5%' - AND fc.checksize > {size} + WHERE fc.checksum = %s AND fc.checktype LIKE 'md5%' + AND fc.checksize > %s AND fs.status = 'detection' """ - cursor.execute(query) + cursor.execute(query, (checksum, size)) records = cursor.fetchall() if records: for record in records: @@ -2485,11 +2503,12 @@ def handle_matched_filesets( if is_full_matched: break cursor.execute( - f"SELECT status FROM fileset WHERE id = {matched_fileset_id}" + "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,) ) status = cursor.fetchone()["status"] cursor.execute( - f"SELECT COUNT(file.id) FROM file WHERE fileset = {matched_fileset_id}" + "SELECT COUNT(file.id) FROM file WHERE fileset = %s", + (matched_fileset_id,), ) count = cursor.fetchone()["COUNT(file.id)"] @@ -2535,28 +2554,31 @@ def handle_matched_filesets( def delete_original_fileset(fileset_id, conn): with conn.cursor() as cursor: - cursor.execute(f"DELETE FROM file WHERE fileset = {fileset_id}") - cursor.execute(f"DELETE FROM fileset WHERE id = {fileset_id}") + cursor.execute("DELETE FROM file WHERE fileset = %s", (fileset_id,)) + cursor.execute("DELETE FROM fileset WHERE id = %s", (fileset_id,)) conn.commit() def update_fileset_status(cursor, fileset_id, status): - cursor.execute(f""" + cursor.execute( + """ UPDATE fileset SET - status = '{status}', - `timestamp` = FROM_UNIXTIME({int(time.time())}) - WHERE id = {fileset_id} - """) + status = %s, + `timestamp` = FROM_UNIXTIME(%s) + WHERE id = %s + """, + (status, int(time.time()), fileset_id), + ) def populate_file(fileset, fileset_id, conn, detection): with conn.cursor() as cursor: - cursor.execute(f"SELECT * FROM file WHERE fileset = {fileset_id}") + cursor.execute("SELECT * FROM file WHERE fileset = %s", (fileset_id,)) target_files = cursor.fetchall() target_files_dict = {} for target_file in target_files: cursor.execute( - f"SELECT * FROM filechecksum WHERE file = {target_file['id']}" + "SELECT * FROM filechecksum WHERE file = %s", (target_file["id"],) ) target_checksums = cursor.fetchall() for checksum in target_checksums: @@ -2681,7 +2703,8 @@ def set_populate_file(fileset, fileset_id, conn, detection): with conn.cursor() as cursor: # Extracting the filename from the filepath. 
cursor.execute( - f"SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = {fileset_id}" + "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = %s", + (fileset_id,), ) target_files = cursor.fetchall() candidate_files = { @@ -2723,23 +2746,17 @@ def set_populate_file(fileset, fileset_id, conn, detection): ): name = normalised_path(file["name"]) values = [name] - values.append(file["size"] if "size" in file else "0") values.append(file["size-r"] if "size-r" in file else "0") values.append(file["size-rd"] if "size-rd" in file else "0") - values.extend([checksum, fileset_id, detection, "None"]) - placeholders = ( - ["%s"] * (len(values[:5])) + ["%s"] + ["%s"] * 2 + ["NOW()"] - ) - query = f"INSERT INTO file ( name, size, `size-r`, `size-rd`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ({', '.join(placeholders)})" + query = "INSERT INTO file ( name, size, `size-r`, `size-rd`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, NOW())" cursor.execute(query, values) cursor.execute("SET @file_last = LAST_INSERT_ID()") cursor.execute("SELECT @file_last AS file_id") - cursor.execute("SELECT @file_last AS file_id") file_id = cursor.fetchone()["file_id"] insert_filechecksum(file, "md5", file_id, conn) @@ -2877,7 +2894,8 @@ def finalize_fileset_insertion( ): with conn.cursor() as cursor: cursor.execute( - f"SELECT COUNT(fileset) from transactions WHERE `transaction` = {transaction_id}" + "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", + (transaction_id,), ) fileset_insertion_count = cursor.fetchone()["COUNT(fileset)"] category_text = f"Uploaded from {src}" @@ -2915,7 +2933,7 @@ def user_integrity_check(data, ip, game_metadata=None): print(f"Failed to connect to database: {e}") return - conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}") + conn.cursor().execute("SET @fileset_time_last = %s", (int(time.time()),)) try: with conn.cursor() as cursor: @@ -2940,12 +2958,13 @@ def user_integrity_check(data, ip, game_metadata=None): missing_set = set() for fileset_id in matched_map.keys(): - cursor.execute(f"SELECT * FROM file WHERE fileset = {fileset_id}") + cursor.execute("SELECT * FROM file WHERE fileset = %s", (fileset_id,)) target_files = cursor.fetchall() target_files_dict = {} for target_file in target_files: cursor.execute( - f"SELECT * FROM filechecksum WHERE file = {target_file['id']}" + "SELECT * FROM filechecksum WHERE file = %s", + (target_file["id"],), ) target_checksums = cursor.fetchall() for checksum in target_checksums: @@ -3025,12 +3044,13 @@ def user_integrity_check(data, ip, game_metadata=None): most_matched = matched_list[0] matched_fileset_id, matched_count = most_matched[0], most_matched[1] cursor.execute( - f"SELECT status FROM fileset WHERE id = {matched_fileset_id}" + "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,) ) status = cursor.fetchone()["status"] cursor.execute( - f"SELECT COUNT(file.id) FROM file WHERE fileset = {matched_fileset_id}" + "SELECT COUNT(file.id) FROM file WHERE fileset = %s", + (matched_fileset_id,), ) count = cursor.fetchone()["COUNT(file.id)"] if status == "full" and count == matched_count: @@ -3068,13 +3088,14 @@ def user_integrity_check(data, ip, game_metadata=None): def add_usercount(fileset, conn): with conn.cursor() as cursor: cursor.execute( - f"UPDATE fileset SET user_count = COALESCE(user_count, 0) + 1 WHERE id = {fileset}" + "UPDATE fileset SET user_count 
= COALESCE(user_count, 0) + 1 WHERE id = %s", + (fileset,), ) - cursor.execute(f"SELECT user_count from fileset WHERE id = {fileset}") + cursor.execute("SELECT user_count from fileset WHERE id = %s", (fileset,)) count = cursor.fetchone()["user_count"] if count >= 3: cursor.execute( - f"UPDATE fileset SET status = 'ReadyForReview' WHERE id = {fileset}" + "UPDATE fileset SET status = 'ReadyForReview' WHERE id = %s", (fileset,) )
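
A note on the parameter binding introduced above: with a format-style MySQL driver such as PyMySQL or mysql-connector (the dict-style fetchone() results in these files suggest one of the two, though the patches never name the driver), two of the converted queries deserve extra care. A literal % in a query that is executed with an args tuple, as in LIKE 'md5%' in matching_set(), normally has to be written as %%, and IN (%s) in find_matching_filesets() binds exactly one value, so if `state` holds a comma-joined list of statuses it would be compared as a single literal. The sketch below shows how both cases are commonly handled with this kind of driver; the helper names, the list-valued `states` argument, and the dict-style rows are illustrative assumptions, not part of db_functions.py.

def find_filesets_by_checksum(cursor, checksum, checktype, states):
    # One placeholder per status value; "IN (%s)" with a pre-joined string
    # would compare fs.status against that whole string as a single literal.
    placeholders = ", ".join(["%s"] * len(states))
    query = (
        "SELECT DISTINCT fs.id AS fileset_id "
        "FROM fileset fs "
        "JOIN file f ON fs.id = f.fileset "
        "JOIN filechecksum fc ON f.id = fc.file "
        f"WHERE fc.checksum = %s AND fc.checktype = %s AND fs.status IN ({placeholders})"
    )
    cursor.execute(query, (checksum, checktype, *states))
    return [row["fileset_id"] for row in cursor.fetchall()]


def find_detection_filesets_for_md5(cursor, checksum, size):
    # The literal % in the LIKE pattern is doubled to %% so the driver does
    # not read it as a format placeholder when the args tuple is applied.
    query = (
        "SELECT DISTINCT fs.id AS fileset_id "
        "FROM fileset fs "
        "JOIN file f ON fs.id = f.fileset "
        "JOIN filechecksum fc ON f.id = fc.file "
        "WHERE fc.checksum = %s AND fc.checktype LIKE 'md5%%' "
        "AND fc.checksize > %s AND fs.status = 'detection'"
    )
    cursor.execute(query, (checksum, size))
    return [row["fileset_id"] for row in cursor.fetchall()]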
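
Several of these hunks read back the id of a freshly inserted file by setting the @file_last session variable and selecting it again. Both PyMySQL and mysql-connector also expose that value as cursor.lastrowid on the cursor that ran the INSERT, which can stand in for the extra SET/SELECT round trip. A minimal sketch under that assumption, with an illustrative column list (the real insert_file() builds a wider row):

def insert_file_row(cursor, name, size, fileset_id):
    # Illustrative INSERT; only the id-retrieval pattern is the point here.
    cursor.execute(
        "INSERT INTO file (name, size, fileset, `timestamp`) VALUES (%s, %s, %s, NOW())",
        (name, size, fileset_id),
    )
    # lastrowid mirrors LAST_INSERT_ID() for the cursor that ran the INSERT,
    # so the SET @file_last / SELECT @file_last pair can be skipped.
    return cursor.lastrowid

The returned id can then be handed straight to insert_filechecksum(file, key, file_id, conn), exactly as the new signature in these patches expects.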