From 161abd5cb33b739fa40618768a9349634a03ad58 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Fri, 4 Jul 2025 19:52:16 +0530 Subject: [PATCH 01/30] INTEGRITY: Increase character limit size for log text --- schema.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/schema.py b/schema.py index 09cbe7f..2ebfeeb 100644 --- a/schema.py +++ b/schema.py @@ -202,6 +202,10 @@ cursor.execute( "ALTER TABLE file MODIFY COLUMN `size-r` BIGINT DEFAULT 0, MODIFY COLUMN `size-rd` BIGINT DEFAULT 0;" ) +try: + cursor.execute("ALTER TABLE log ADD COLUMN `text` varchar(1000);") +except Exception: + cursor.execute("ALTER TABLE log MODIFY COLUMN `text` varchar(1000);") for index, definition in indices.items(): From 94cd1d98a34c2a2c22b45e0ae9076dcbf7d54e44 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Fri, 4 Jul 2025 19:54:58 +0530 Subject: [PATCH 02/30] INTEGRITY: Separate the additional checksum add logic from insert_filechecksum. Add new equal checksums for set.dat's fileset match --- db_functions.py | 55 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/db_functions.py b/db_functions.py index 45adc5d..de74d58 100644 --- a/db_functions.py +++ b/db_functions.py @@ -233,26 +233,34 @@ def insert_file(file, detection, src, conn): cursor.execute("SET @file_last = LAST_INSERT_ID()") -def insert_filechecksum(file, checktype, conn): +def insert_filechecksum(file, checktype, file_id, conn): if checktype not in file: return checksum = file[checktype] checksize, checktype, checksum = get_checksum_props(checktype, checksum) - query = f"INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (@file_last, '{checksize}', '{checktype}', '{checksum}')" + query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)" + with conn.cursor() as cursor: + cursor.execute(query, (file_id, checksize, checktype, checksum)) + + add_all_equal_checksums(checksize, checktype, checksum, file_id, conn) + + +def add_all_equal_checksums(checksize, checktype, checksum, file_id, conn): + """ + We can update all the checksums when file size is less than the checksum size type, as all checksums are equal in that case. 
+ """ with conn.cursor() as cursor: - cursor.execute(query) if "md5" not in checktype: return - size_name = "size" if checktype[-1] == "r": size_name += "-rd" if checktype[-1] == "s": size_name += "-d" - cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = @file_last") + cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = {file_id}") result = cursor.fetchone() if not result: return @@ -281,9 +289,10 @@ def insert_filechecksum(file, checktype, conn): checksum_size = exploded.pop() checksum_type = "-".join(exploded) - query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (@file_last, %s, %s, %s)" - with conn.cursor() as cursor: - cursor.execute(query, (checksum_size, checksum_type, checksum)) + query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)" + cursor.execute( + query, (file_id, checksum_size, checksum_type, checksum) + ) def delete_filesets(conn): @@ -558,9 +567,13 @@ def db_insert(data_arr, username=None, skiplog=False): for file in unique_files: insert_file(file, detection, src, conn) + file_id = None + with conn.cursor() as cursor: + cursor.execute("SELECT @file_last AS file_id") + file_id = cursor.fetchone()["file_id"] for key, value in file.items(): if key not in ["name", "size", "size-r", "size-rd", "sha1", "crc"]: - insert_filechecksum(file, key, conn) + insert_filechecksum(file, key, file_id, conn) if detection: conn.cursor().execute( @@ -1070,7 +1083,6 @@ def set_perform_match( with conn.cursor() as cursor: if len(candidate_filesets) == 1: matched_fileset_id = candidate_filesets[0] - cursor.execute( "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,) ) @@ -1123,6 +1135,7 @@ def set_perform_match( elif len(candidate_filesets) > 1: found_match = False + for candidate_fileset in candidate_filesets: (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn) if is_match: @@ -1579,7 +1592,7 @@ def populate_file(fileset, fileset_id, conn, detection): for key, value in file.items(): if key not in ["name", "size", "size-r", "size-rd", "sha1", "crc"]: - insert_filechecksum(file, key, conn) + insert_filechecksum(file, key, file_id, conn) if value in target_files_dict and not file_exists: cursor.execute( f"SELECT detection_type FROM file WHERE id = {target_files_dict[value]['id']}" @@ -1683,7 +1696,10 @@ def set_populate_file(fileset, fileset_id, conn, detection): cursor.execute("SET @file_last = LAST_INSERT_ID()") cursor.execute("SELECT @file_last AS file_id") - insert_filechecksum(file, "md5", conn) + cursor.execute("SELECT @file_last AS file_id") + file_id = cursor.fetchone()["file_id"] + + insert_filechecksum(file, "md5", file_id, conn) else: query = """ @@ -1701,6 +1717,7 @@ def set_populate_file(fileset, fileset_id, conn, detection): candidate_files[filename.lower()][0], ), ) + query = """ INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s) @@ -1714,6 +1731,14 @@ def set_populate_file(fileset, fileset_id, conn, detection): checksum, ), ) + + add_all_equal_checksums( + checksize, + checktype, + checksum, + candidate_files[filename.lower()][0], + conn, + ) seen_detection_files.add((filename.lower(), file["size"])) @@ -1745,9 +1770,13 @@ def insert_new_fileset( if fileset_id: for file in fileset["rom"]: insert_file(file, detection, src, conn) + file_id = None + with conn.cursor() as cursor: + cursor.execute("SELECT @file_last AS file_id") + file_id = cursor.fetchone()["file_id"] for key, value in file.items(): if key not in ["name", "size", 
"size-r", "size-rd", "sha1", "crc"]: - insert_filechecksum(file, key, conn) + insert_filechecksum(file, key, file_id, conn) return (fileset_id, existing) From cc82c6f065cbeea1e7867b203dd4e3b0f8c3527a Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Fri, 4 Jul 2025 23:37:48 +0530 Subject: [PATCH 03/30] INTEGRITY: Add filtering by platform for set.dat to reduce manual merge. --- db_functions.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/db_functions.py b/db_functions.py index de74d58..f405f41 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1028,6 +1028,12 @@ def set_process( for fileset_id, candidate_filesets in set_to_candidate_dict.items(): fileset = id_to_fileset_dict[fileset_id] + + # Filter by platform to reduce manual merge + candidate_filesets = set_filter_by_platform( + fileset["name"], candidate_filesets, conn + ) + ( fully_matched_filesets, auto_merged_filesets, @@ -1063,6 +1069,47 @@ def set_process( create_log(escape_string(category_text), user, escape_string(log_text), conn) +def set_filter_by_platform(gameid, candidate_filesets, conn): + """ + Return - list(number) : list of fileset ids of filtered candidates. + The number of manual merges in case the file size is not present (equal to -1) are too high. So we try to filter by platform extracted from the gameId of the set.dat fileset. We may disable this feature later or keep it optional with a command line argument. + """ + with conn.cursor() as cursor: + # e.g. sq2-coco3-1 + possible_platform_names = gameid.split("-")[1:] + + # Align platform names in set.dat and detection entries + for i, platform in enumerate(possible_platform_names): + if platform == "win": + possible_platform_names[i] = "windows" + elif platform == "mac": + possible_platform_names[i] = "macintosh" + + filtered_candidate_fileset = [] + + for candidate_fileset_id in candidate_filesets: + query = """ + SELECT g.platform + FROM fileset fs + JOIN game g ON g.id = fs.game + WHERE fs.id = %s + """ + cursor.execute(query, (candidate_fileset_id,)) + candidate_platform = cursor.fetchone()["platform"] + if candidate_platform in possible_platform_names: + filtered_candidate_fileset.append(candidate_fileset_id) + + if len(filtered_candidate_fileset) != 0: + print(len(candidate_filesets), " ", len(filtered_candidate_fileset), "\n") + + # If nothing was filtred, then it is likely, that platform information was not present, so we fallback to original list of candidates. + return ( + candidate_filesets + if len(filtered_candidate_fileset) == 0 + else filtered_candidate_fileset + ) + + def set_perform_match( fileset, src, From 599d2f1219a46e18265e2358b4fd2ef7a588902b Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Mon, 7 Jul 2025 18:45:18 +0530 Subject: [PATCH 04/30] INTEGRITY: Avoid adding a fileset as candidate if it was marked as partial in the same run. 
--- db_functions.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/db_functions.py b/db_functions.py index f405f41..3b18fd6 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1034,6 +1034,16 @@ def set_process( fileset["name"], candidate_filesets, conn ) + for candidate_fileset in candidate_filesets: + with conn.cursor() as cursor: + cursor.execute( + "SELECT id FROM fileset WHERE status = 'current' AND id = %s", + (candidate_fileset), + ) + result = cursor.fetchone() + if result: + candidate_filesets.remove(candidate_fileset) + ( fully_matched_filesets, auto_merged_filesets, @@ -1056,6 +1066,8 @@ def set_process( # Final log with conn.cursor() as cursor: + cursor.execute("UPDATE fileset SET status = 'partial' WHERE status = 'current'") + cursor.execute( "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", (transaction_id,), @@ -1099,9 +1111,6 @@ def set_filter_by_platform(gameid, candidate_filesets, conn): if candidate_platform in possible_platform_names: filtered_candidate_fileset.append(candidate_fileset_id) - if len(filtered_candidate_fileset) != 0: - print(len(candidate_filesets), " ", len(filtered_candidate_fileset), "\n") - # If nothing was filtred, then it is likely, that platform information was not present, so we fallback to original list of candidates. return ( candidate_filesets @@ -1135,7 +1144,7 @@ def set_perform_match( ) status = cursor.fetchone()["status"] if status == "detection": - update_fileset_status(cursor, matched_fileset_id, "partial") + update_fileset_status(cursor, matched_fileset_id, "current") set_populate_file(fileset, matched_fileset_id, conn, detection) auto_merged_filesets += 1 if not skiplog: @@ -1186,7 +1195,7 @@ def set_perform_match( for candidate_fileset in candidate_filesets: (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn) if is_match: - update_fileset_status(cursor, candidate_fileset, "partial") + update_fileset_status(cursor, candidate_fileset, "current") set_populate_file(fileset, candidate_fileset, conn, detection) auto_merged_filesets += 1 if not skiplog: From 6ea2bbae514a151cb0cec1f78e7525afdd87bcbe Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Mon, 7 Jul 2025 22:45:20 +0530 Subject: [PATCH 05/30] INTEGRITY: Add additional filtering logic for glk engines --- db_functions.py | 127 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 115 insertions(+), 12 deletions(-) diff --git a/db_functions.py b/db_functions.py index 3b18fd6..8800a8b 100644 --- a/db_functions.py +++ b/db_functions.py @@ -965,9 +965,17 @@ def set_process( if existing: continue - candidate_filesets = set_filter_candidate_filesets( - fileset_id, fileset, transaction_id, conn - ) + # Separating out the matching logic for glk engine + engine_name = fileset["sourcefile"].split("-")[0] + + if engine_name == "glk": + candidate_filesets = set_glk_filter_candidate_filesets( + fileset_id, fileset, transaction_id, engine_name, conn + ) + else: + candidate_filesets = set_filter_candidate_filesets( + fileset_id, fileset, transaction_id, conn + ) # Mac files in set.dat are not represented properly and they won't find a candidate fileset for a match, so we can drop them. 
if len(candidate_filesets) == 0: @@ -1288,6 +1296,84 @@ def is_full_checksum_match(candidate_fileset, fileset, conn): return (len(unmatched_files) == 0, unmatched_files) +def set_glk_filter_candidate_filesets( + fileset_id, fileset, transaction_id, engine_name, conn +): + """ + Returns a list of candidate filesets for glk engines that can be merged + """ + with conn.cursor() as cursor: + # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches + + query = """ + WITH candidate_fileset AS ( + SELECT fs.id AS fileset_id, f.size + FROM file f + JOIN fileset fs ON f.fileset = fs.id + JOIN game g ON g.id = fs.game + JOIN engine e ON e.id = g.engine + JOIN transactions t ON t.fileset = fs.id + WHERE fs.id != %s + AND e.engineid = %s + AND f.detection = 1 + AND t.transaction != %s + AND (g.gameid = %s OR (g.gameid != %s AND g.gameid LIKE %s)) + ), + total_detection_files AS ( + SELECT cf.fileset_id, COUNT(*) AS detection_files_found + FROM candidate_fileset cf + GROUP BY fileset_id + ), + set_fileset AS ( + SELECT size FROM file + WHERE fileset = %s + ), + matched_detection_files AS ( + SELECT cf.fileset_id, COUNT(*) AS match_files_count + FROM candidate_fileset cf + JOIN set_fileset sf ON + cf.size = sf.size OR cf.size = 0 + GROUP BY cf.fileset_id + ), + valid_matched_detection_files AS ( + SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count + FROM matched_detection_files mdf + JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id + WHERE tdf.detection_files_found <= mdf.match_files_count + ), + max_match_count AS ( + SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files + ) + SELECT vmdf.fileset_id + FROM valid_matched_detection_files vmdf + JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id + JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count + """ + + gameid_pattern = f"%{fileset['name']}%" + + cursor.execute( + query, + ( + fileset_id, + engine_name, + transaction_id, + fileset["name"], + fileset["name"], + gameid_pattern, + fileset_id, + ), + ) + rows = cursor.fetchall() + + candidates = [] + if rows: + for row in rows: + candidates.append(row["fileset_id"]) + + return candidates + + def set_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): """ Returns a list of candidate filesets that can be merged @@ -1715,6 +1801,13 @@ def set_populate_file(fileset, fileset_id, conn, detection): for target_file in target_files } + # For glk engines + candidate_file_size = { + target_file["size"]: target_file["id"] for target_file in target_files + } + + engine_name = fileset["sourcefile"].split("-")[0] + seen_detection_files = set() for file in fileset["rom"]: @@ -1724,13 +1817,16 @@ def set_populate_file(fileset, fileset_id, conn, detection): filename = os.path.basename(normalised_path(file["name"])) - if ((filename.lower(), file["size"]) in seen_detection_files) or ( - filename.lower() not in candidate_files + if (engine_name == "glk" and file["size"] not in candidate_file_size) and ( + (filename.lower(), file["size"]) in seen_detection_files or ( - filename.lower() in candidate_files - and ( - candidate_files[filename.lower()][1] != -1 - and candidate_files[filename.lower()][1] != file["size"] + filename.lower() not in candidate_files + or ( + filename.lower() in candidate_files + and ( + candidate_files[filename.lower()][1] != -1 + and 
candidate_files[filename.lower()][1] != file["size"] + ) ) ) ): @@ -1764,13 +1860,16 @@ def set_populate_file(fileset, fileset_id, conn, detection): name = %s WHERE id = %s """ + # Filtering was by filename, but we are still updating the file with the original filepath. cursor.execute( query, ( file["size"], normalised_path(file["name"]), - candidate_files[filename.lower()][0], + candidate_files[filename.lower()][0] + if engine_name != "glk" + else candidate_file_size[file["size"]], ), ) @@ -1781,7 +1880,9 @@ def set_populate_file(fileset, fileset_id, conn, detection): cursor.execute( query, ( - candidate_files[filename.lower()][0], + candidate_files[filename.lower()][0] + if engine_name != "glk" + else candidate_file_size[file["size"]], checksize, checktype, checksum, @@ -1792,7 +1893,9 @@ def set_populate_file(fileset, fileset_id, conn, detection): checksize, checktype, checksum, - candidate_files[filename.lower()][0], + candidate_files[filename.lower()][0] + if engine_name != "glk" + else candidate_file_size[file["size"]], conn, ) seen_detection_files.add((filename.lower(), file["size"])) From 3028d19c84d822efc2b30cd220fc2472d0f7a36e Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 8 Jul 2025 02:35:31 +0530 Subject: [PATCH 06/30] INTEGRITY: Add timestamp field in scan.dat and filtering support via modification time --- compute_hash.py | 56 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/compute_hash.py b/compute_hash.py index 0067cfd..b5c743a 100644 --- a/compute_hash.py +++ b/compute_hash.py @@ -4,6 +4,8 @@ import struct import sys from enum import Enum +from datetime import datetime, date +from collections import defaultdict class FileType(Enum): NON_MAC = "non_mac" @@ -154,7 +156,6 @@ def is_actual_resource_fork_mac(filepath): """ Returns boolean, checking the actual mac fork if it exists. 
""" resource_fork_path = os.path.join(filepath, "..namedfork", "rsrc") - print(resource_fork_path) return os.path.exists(resource_fork_path) def is_appledouble(file_byte_stream): @@ -505,7 +506,7 @@ def file_filter(files): for file in to_be_deleted: del files[file] -def compute_hash_of_dirs(root_directory, depth, size=0, alg="md5"): +def compute_hash_of_dirs(root_directory, depth, size=0, limit_timestamps_date=None, alg="md5"): """ Return dictionary containing checksums of all files in directory """ res = [] @@ -518,10 +519,14 @@ def compute_hash_of_dirs(root_directory, depth, size=0, alg="md5"): for root, _, contents in os.walk(directory): files.extend([os.path.join(root, f) for f in contents]) + # Filter out the files based on user input date - limit_timestamps_date + filtered_file_map = filter_files_by_timestamp(files, limit_timestamp_date) + # Produce filetype and filename(name to be used in game entry) for each file - for filepath in files: + for filepath in filtered_file_map: file_collection[filepath] = file_classification(filepath) + # Remove extra entries of macfiles to avoid extra checksum calculation in form of non mac files # Checksum for both the forks are calculated using a single file, so other files should be removed from the collection file_filter(file_collection) @@ -538,11 +543,45 @@ def compute_hash_of_dirs(root_directory, depth, size=0, alg="md5"): relative_dir = os.path.dirname(os.path.dirname(relative_path)) relative_path = os.path.join(relative_dir, base_name) - hash_of_dir[relative_path] = file_checksum(file_path, alg, size, file_info) + hash_of_dir[relative_path] = file_checksum(file_path, alg, size, file_info) + (filtered_file_map[file_path],) res.append(hash_of_dir) return res + +def validate_date(date_str): + """ + Confirms if the user provided timestamp is in a valid format. + Returns the date as a datetime object. + """ + formats = ["%Y-%m-%d", "%Y-%m", "%Y"] + for fmt in formats: + try: + return datetime.strptime(date_str, fmt).date() + except ValueError: + continue + raise ValueError("Invalid date format. Use YYYY, YYYY-MM, or YYYY-MM-DD") + + +def filter_files_by_timestamp(files, limit_timestamps_date): + """ + Removes the files those were modified after a certain timestamp provided by the user. + The files those were modified today are kept. 
+ Returns filtered map with filepath and its modification time + """ + + filtered_file_map = defaultdict(str) + user_date = validate_date(limit_timestamps_date) + today = date.today() + + for filepath in files: + mtime = datetime.fromtimestamp(os.path.getmtime(filepath)).date() + if limit_timestamps_date is None or (limit_timestamps_date is not None and (mtime <= user_date or mtime == today)): + filtered_file_map[filepath] = str(mtime) + + return filtered_file_map + + def create_dat_file(hash_of_dirs, path, checksum_size=0): with open(f"{os.path.basename(path)}.dat", "w") as file: # Header @@ -556,8 +595,8 @@ def create_dat_file(hash_of_dirs, path, checksum_size=0): # Game files for hash_of_dir in hash_of_dirs: file.write("game (\n") - for filename, (hashes, filesize) in hash_of_dir.items(): - data = f"name \"{filename}\" size {filesize}" + for filename, (hashes, filesize, timestamp) in hash_of_dir.items(): + data = f"name \"{filename}\" size {filesize} timestamp {timestamp}" for key, value in hashes: data += f" {key} {value}" @@ -579,10 +618,13 @@ def error(self, message): help="Depth from root to game directories") parser.add_argument("--size", help="Use first n bytes of file to calculate checksum") +parser.add_argument("--limit-timestamps", + help="Format - YYYY-MM-DD or YYYY-MM or YYYY. Filters out the files those were modified after the given timestamp. Note that if the modification time is today, it would not be filtered out.") args = parser.parse_args() path = os.path.abspath(args.directory) if args.directory else os.getcwd() depth = int(args.depth) if args.depth else 0 checksum_size = int(args.size) if args.size else 0 +limit_timestamp_date = str(args.limit_timestamps) if args.limit_timestamps else None create_dat_file(compute_hash_of_dirs( - path, depth, checksum_size), path, checksum_size) + path, depth, checksum_size, limit_timestamp_date), path, checksum_size) From 19b19c0224d6d332a991bbd66a3d63d3982779e9 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 8 Jul 2025 03:17:42 +0530 Subject: [PATCH 07/30] INTEGRITY: Add all size variants to scan.dat - size, size-r and size-rd. 
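Every file entry in the generated dat now carries the data fork size (size), the resource fork size (size-r) and the size of the resource fork's data section (size-rd), next to the timestamp field added in the previous commit. A resulting entry line looks roughly like the following (path, sizes, date and checksum keys are illustrative):

    name "DISK1/GAME.BIN" size 123456 size-r 2048 size-rd 1536 timestamp 2025-07-08 md5 <hash> md5-r <hash> md5-d <hash>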
--- compute_hash.py | 89 +++++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/compute_hash.py b/compute_hash.py index b5c743a..3b0e155 100644 --- a/compute_hash.py +++ b/compute_hash.py @@ -179,7 +179,7 @@ def is_appledouble(file_byte_stream): return True def macbin_get_resfork_data(file_byte_stream): - """ Returns the resource fork's data section as bytes of a macbinary file as well as its size """ + """ Returns the resource fork's data section as bytes, data fork size (size), resource fork size (size-r) and data section of resource fork size (size-rd) of a macbinary file """ if not file_byte_stream: return file_byte_stream @@ -189,10 +189,10 @@ def macbin_get_resfork_data(file_byte_stream): (rsrclen,) = struct.unpack(">I", file_byte_stream[0x57:0x5B]) resoure_fork_offset = 128 + datalen_padded - data_offset = int.from_bytes(file_byte_stream[resoure_fork_offset+0 : resoure_fork_offset+4]) - data_length = int.from_bytes(file_byte_stream[resoure_fork_offset+8 : resoure_fork_offset+12]) + rd_offset = int.from_bytes(file_byte_stream[resoure_fork_offset+0 : resoure_fork_offset+4]) + rd_length = int.from_bytes(file_byte_stream[resoure_fork_offset+8 : resoure_fork_offset+12]) - return (file_byte_stream[resoure_fork_offset + data_offset: resoure_fork_offset + data_offset + data_length], data_length) + return (file_byte_stream[resoure_fork_offset + rd_offset: resoure_fork_offset + rd_offset + rd_length], datalen, rsrclen, rd_length) def macbin_get_datafork(file_byte_stream): if not file_byte_stream: @@ -222,7 +222,7 @@ def is_appledouble(file_byte_stream): return True def appledouble_get_resfork_data(file_byte_stream): - """ Returns the resource fork's data section as bytes of an appledouble file as well as its size """ + """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of an appledouble file""" entry_count = read_be_16(file_byte_stream[24:]) for entry in range(entry_count): @@ -233,13 +233,13 @@ def appledouble_get_resfork_data(file_byte_stream): if id == 2: resource_fork_stream = file_byte_stream[offset:offset+length] - data_offset = int.from_bytes(resource_fork_stream[0:4]) - data_length = int.from_bytes(resource_fork_stream[8:12]) + rd_offset = int.from_bytes(resource_fork_stream[0:4]) + rd_length = int.from_bytes(resource_fork_stream[8:12]) - return (resource_fork_stream[data_offset: data_offset+data_length], data_length) + return (resource_fork_stream[rd_offset: rd_offset+rd_length], length, rd_length) def appledouble_get_datafork(filepath, fileinfo): - """ Returns data fork's content as bytes of appledouble file if found, otherwise empty byte string """ + """ Returns data fork's content as bytes and size of data fork of an appledouble file.""" try: index = filepath.index("__MACOSX") except ValueError: @@ -253,50 +253,54 @@ def appledouble_get_datafork(filepath, fileinfo): try: with open(data_fork_path, "rb") as f: - return f.read() + data = f.read() + return (data, len(data)) except (FileNotFoundError, IsADirectoryError): return b'' def raw_rsrc_get_datafork(filepath): - """ Returns the data fork's content as bytes corresponding to raw rsrc file. """ + """ Returns the data fork's content as bytes and size of the data fork corresponding to raw rsrc file. 
""" try: with open(filepath[:-5]+".data", "rb") as f: - return f.read() + data = f.read() + return (data, len(data)) except (FileNotFoundError, IsADirectoryError): return b'' def raw_rsrc_get_resource_fork_data(filepath): - """ Returns the resource fork's data section as bytes of a raw rsrc file as well as its size """ + """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of a raw rsrc file.""" with open(filepath, "rb") as f: resource_fork_stream = f.read() - data_offset = int.from_bytes(resource_fork_stream[0:4]) - data_length = int.from_bytes(resource_fork_stream[8:12]) + resource_fork_len = len(resource_fork_stream) + rd_offset = int.from_bytes(resource_fork_stream[0:4]) + rd_length = int.from_bytes(resource_fork_stream[8:12]) - return (resource_fork_stream[data_offset: data_offset+data_length], data_length) + return (resource_fork_stream[rd_offset: rd_offset+rd_length], resource_fork_len, rd_length) def actual_mac_fork_get_data_fork(filepath): - """ Returns the data fork's content as bytes if the actual mac fork exists """ + """ Returns the data fork's content as bytes and its size if the actual mac fork exists """ try: with open(filepath, "rb") as f: - return f.read() + data = f.read() + return (data, len(data)) except (FileNotFoundError, IsADirectoryError): return b'' def actual_mac_fork_get_resource_fork_data(filepath): - """ Returns the resource fork's data section as bytes of the actual mac fork as well as its size """ + """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of the actual mac fork.""" resource_fork_path = os.path.join(filepath, "..namedfork", "rsrc") with open(resource_fork_path, "rb") as f: resource_fork_stream = f.read() - data_offset = int.from_bytes(resource_fork_stream[0:4]) - data_length = int.from_bytes(resource_fork_stream[8:12]) + resource_fork_len = len(resource_fork_stream) + rd_offset = int.from_bytes(resource_fork_stream[0:4]) + rd_length = int.from_bytes(resource_fork_stream[8:12]) - return (resource_fork_stream[data_offset: data_offset+data_length], data_length) + return (resource_fork_stream[rd_offset: rd_offset+rd_length], resource_fork_len, rd_length) -def file_checksum(filepath, alg, size, file_info): - cur_file_size = 0 +def file_checksum(filepath, alg, custom_checksum_size, file_info): with open(filepath, "rb") as f: if file_info[0] == FileType.NON_MAC: - return (create_checksum_pairs(checksum(f, alg, size, filepath), alg, size), filesize(filepath)) + return (create_checksum_pairs(checksum(f, alg, custom_checksum_size, filepath), alg, custom_checksum_size), filesize(filepath), 0, 0) # Processing mac files res = [] @@ -304,29 +308,33 @@ def file_checksum(filepath, alg, size, file_info): datafork = b'' file_data = f.read() + size = 0 + size_r = 0 + size_rd = 0 + if file_info[0] == FileType.MAC_BINARY: - (resfork, cur_file_size) = macbin_get_resfork_data(file_data) + (resfork, size, size_r, size_rd) = macbin_get_resfork_data(file_data) datafork = macbin_get_datafork(file_data) elif file_info[0] in {FileType.APPLE_DOUBLE_DOT_, FileType.APPLE_DOUBLE_RSRC, FileType.APPLE_DOUBLE_MACOSX}: - (resfork, cur_file_size) = appledouble_get_resfork_data(file_data) - datafork = appledouble_get_datafork(filepath, file_info) + (resfork, size_r, size_rd) = appledouble_get_resfork_data(file_data) + (datafork, size) = appledouble_get_datafork(filepath, file_info) elif file_info[0] == 
FileType.RAW_RSRC: - (resfork, cur_file_size) = raw_rsrc_get_resource_fork_data(filepath) - datafork = raw_rsrc_get_datafork(filepath) + (resfork, size_r, size_rd) = raw_rsrc_get_resource_fork_data(filepath) + datafork, size = raw_rsrc_get_datafork(filepath) elif file_info[0] == FileType.ACTUAL_FORK_MAC: - (resfork, cur_file_size) = actual_mac_fork_get_resource_fork_data(filepath) - datafork = actual_mac_fork_get_data_fork(filepath) + (resfork, size_r, size_rd) = actual_mac_fork_get_resource_fork_data(filepath) + (datafork, size) = actual_mac_fork_get_data_fork(filepath) - hashes = checksum(resfork, alg, size, filepath) + hashes = checksum(resfork, alg, custom_checksum_size, filepath) prefix = 'r' if len(resfork): - res.extend(create_checksum_pairs(hashes, alg, size, prefix)) + res.extend(create_checksum_pairs(hashes, alg, custom_checksum_size, prefix)) - hashes = checksum(datafork, alg, size, filepath) + hashes = checksum(datafork, alg, custom_checksum_size, filepath) prefix = 'd' - res.extend(create_checksum_pairs(hashes, alg, size, prefix)) + res.extend(create_checksum_pairs(hashes, alg, custom_checksum_size, prefix)) - return (res, cur_file_size) + return (res, size, size_r, size_rd) def create_checksum_pairs(hashes, alg, size, prefix=None): res = [] @@ -571,7 +579,8 @@ def filter_files_by_timestamp(files, limit_timestamps_date): """ filtered_file_map = defaultdict(str) - user_date = validate_date(limit_timestamps_date) + if limit_timestamp_date is not None: + user_date = validate_date(limit_timestamps_date) today = date.today() for filepath in files: @@ -595,8 +604,8 @@ def create_dat_file(hash_of_dirs, path, checksum_size=0): # Game files for hash_of_dir in hash_of_dirs: file.write("game (\n") - for filename, (hashes, filesize, timestamp) in hash_of_dir.items(): - data = f"name \"{filename}\" size {filesize} timestamp {timestamp}" + for filename, (hashes, size, size_r, size_rd, timestamp) in hash_of_dir.items(): + data = f"name \"{filename}\" size {size} size-r {size_r} size-rd {size_rd} timestamp {timestamp}" for key, value in hashes: data += f" {key} {value}" From 074da92875fda1751230c50c1c1df953a1721850 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 8 Jul 2025 19:52:49 +0530 Subject: [PATCH 08/30] INTEGRITY: Fix clear database hang issue. Now the database is dropped and recreated. 
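Truncating the tables one by one from the /clear_database route could hang, so the route now drops the whole database over a root connection (no database selected) and rebuilds it through the new init_database() entry point in schema.py. The flow is roughly:

    from db_functions import db_connect_root
    from schema import init_database

    conn, db_name = db_connect_root()   # connects without selecting a database
    with conn.cursor() as cursor:
        cursor.execute(f"DROP DATABASE IF EXISTS {db_name}")
    conn.commit()
    init_database()                     # recreates tables, indices and extra columns
    conn.close()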
--- db_functions.py | 18 ++ fileset.py | 20 +- schema.py | 570 ++++++++++++++++++++++++------------------------ 3 files changed, 314 insertions(+), 294 deletions(-) diff --git a/db_functions.py b/db_functions.py index 8800a8b..393906b 100644 --- a/db_functions.py +++ b/db_functions.py @@ -32,6 +32,24 @@ def db_connect(): return conn +def db_connect_root(): + base_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join(base_dir, "mysql_config.json") + with open(config_path) as f: + mysql_cred = json.load(f) + + conn = pymysql.connect( + host=mysql_cred["servername"], + user=mysql_cred["username"], + password=mysql_cred["password"], + charset="utf8mb4", + cursorclass=pymysql.cursors.DictCursor, + autocommit=True, + ) + + return (conn, mysql_cred["dbname"]) + + def get_checksum_props(checkcode, checksum): checksize = 0 checktype = checkcode diff --git a/fileset.py b/fileset.py index 9b9dc93..3f7bc40 100644 --- a/fileset.py +++ b/fileset.py @@ -23,8 +23,10 @@ user_integrity_check, db_connect, create_log, + db_connect_root, ) from collections import defaultdict +from schema import init_database app = Flask(__name__) @@ -79,21 +81,13 @@ def index(): @app.route("/clear_database", methods=["POST"]) def clear_database(): try: - conn = db_connect() + (conn, db_name) = db_connect_root() with conn.cursor() as cursor: - cursor.execute("SET FOREIGN_KEY_CHECKS = 0;") - cursor.execute("TRUNCATE TABLE filechecksum") - cursor.execute("TRUNCATE TABLE history") - cursor.execute("TRUNCATE TABLE transactions") - cursor.execute("TRUNCATE TABLE queue") - cursor.execute("TRUNCATE TABLE file") - cursor.execute("TRUNCATE TABLE fileset") - cursor.execute("TRUNCATE TABLE game") - cursor.execute("TRUNCATE TABLE engine") - cursor.execute("TRUNCATE TABLE log") - cursor.execute("SET FOREIGN_KEY_CHECKS = 1;") + cursor.execute(f"DROP DATABASE IF EXISTS {db_name}") conn.commit() - print("DATABASE CLEARED") + print("DATABASE DROPPED") + init_database() + print("DATABASE INITIALISED") except Exception as e: print(f"Error clearing database: {e}") finally: diff --git a/schema.py b/schema.py index 2ebfeeb..826be7b 100644 --- a/schema.py +++ b/schema.py @@ -5,302 +5,310 @@ from datetime import datetime import os -# Load MySQL credentials -base_dir = os.path.dirname(os.path.abspath(__file__)) -config_path = os.path.join(base_dir, "mysql_config.json") -with open(config_path) as f: - mysql_cred = json.load(f) - -servername = mysql_cred["servername"] -username = mysql_cred["username"] -password = mysql_cred["password"] -dbname = mysql_cred["dbname"] - -# Create connection -conn = pymysql.connect( - host=servername, - user=username, - password=password, - charset="utf8mb4", - cursorclass=pymysql.cursors.DictCursor, - autocommit=False, -) - -# Check connection -if conn is None: - print("Error connecting to MySQL") - exit(1) - -cursor = conn.cursor() - -# Create database -sql = f"CREATE DATABASE IF NOT EXISTS {dbname}" -cursor.execute(sql) - -# Use database -cursor.execute(f"USE {dbname}") - -# Create tables -tables = { - "engine": """ - CREATE TABLE IF NOT EXISTS engine ( - id INT AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(200), - engineid VARCHAR(100) NOT NULL - ) - """, - "game": """ - CREATE TABLE IF NOT EXISTS game ( - id INT AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(200), - engine INT NOT NULL, - gameid VARCHAR(100) NOT NULL, - extra VARCHAR(200), - platform VARCHAR(30), - language VARCHAR(10), - FOREIGN KEY (engine) REFERENCES engine(id) - ) - """, - "fileset": """ - CREATE TABLE IF NOT EXISTS fileset ( - id 
INT AUTO_INCREMENT PRIMARY KEY, - game INT, - status VARCHAR(20), - src VARCHAR(20), - `key` VARCHAR(64), - `megakey` VARCHAR(64), - `delete` BOOLEAN DEFAULT FALSE NOT NULL, - `timestamp` TIMESTAMP NOT NULL, - detection_size INT, - FOREIGN KEY (game) REFERENCES game(id) - ) - """, - "file": """ - CREATE TABLE IF NOT EXISTS file ( - id INT AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(200) NOT NULL, - size BIGINT NOT NULL, - checksum VARCHAR(64) NOT NULL, - fileset INT NOT NULL, - detection BOOLEAN NOT NULL, - FOREIGN KEY (fileset) REFERENCES fileset(id) ON DELETE CASCADE - ) - """, - "filechecksum": """ - CREATE TABLE IF NOT EXISTS filechecksum ( - id INT AUTO_INCREMENT PRIMARY KEY, - file INT NOT NULL, - checksize VARCHAR(10) NOT NULL, - checktype VARCHAR(10) NOT NULL, - checksum VARCHAR(64) NOT NULL, - FOREIGN KEY (file) REFERENCES file(id) ON DELETE CASCADE - ) - """, - "queue": """ - CREATE TABLE IF NOT EXISTS queue ( - id INT AUTO_INCREMENT PRIMARY KEY, - time TIMESTAMP NOT NULL, - notes varchar(300), - fileset INT, - userid INT NOT NULL, - commit VARCHAR(64) NOT NULL, - FOREIGN KEY (fileset) REFERENCES fileset(id) - ) - """, - "log": """ - CREATE TABLE IF NOT EXISTS log ( - id INT AUTO_INCREMENT PRIMARY KEY, - `timestamp` TIMESTAMP NOT NULL, - category VARCHAR(100) NOT NULL, - user VARCHAR(100) NOT NULL, - `text` varchar(300) - ) - """, - "history": """ - CREATE TABLE IF NOT EXISTS history ( - id INT AUTO_INCREMENT PRIMARY KEY, - `timestamp` TIMESTAMP NOT NULL, - fileset INT NOT NULL, - oldfileset INT NOT NULL, - log INT - ) - """, - "transactions": """ - CREATE TABLE IF NOT EXISTS transactions ( - id INT AUTO_INCREMENT PRIMARY KEY, - `transaction` INT NOT NULL, - fileset INT NOT NULL - ) - """, - "possible_merges": """ - CREATE TABLE IF NOT EXISTS possible_merges ( - id INT AUTO_INCREMENT PRIMARY KEY, - child_fileset INT, - parent_fileset INT, - FOREIGN KEY (child_fileset) REFERENCES fileset(id) ON DELETE CASCADE, - FOREIGN KEY (parent_fileset) REFERENCES fileset(id) ON DELETE CASCADE - ) - """, -} -for table, definition in tables.items(): - try: - cursor.execute(definition) - print(f"Table '{table}' created successfully") - except pymysql.Error as err: - print(f"Error creating '{table}' table: {err}") - -# Create indices -indices = { - "detection": "CREATE INDEX detection ON file (detection)", - "checksum": "CREATE INDEX checksum ON filechecksum (checksum)", - "engineid": "CREATE INDEX engineid ON engine (engineid)", - "key": "CREATE INDEX fileset_key ON fileset (`key`)", - "status": "CREATE INDEX status ON fileset (status)", - "fileset": "CREATE INDEX fileset ON history (fileset)", - "file_name_size": "CREATE INDEX file_name_size ON file (name, size)", - "file_fileset_detection": "CREATE INDEX file_fileset_detection ON file (fileset, detection)", -} - -try: - cursor.execute("ALTER TABLE file ADD COLUMN detection_type VARCHAR(20);") -except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE file MODIFY COLUMN detection_type VARCHAR(20);") - -try: - cursor.execute("ALTER TABLE file ADD COLUMN `timestamp` TIMESTAMP NOT NULL;") -except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE file MODIFY COLUMN `timestamp` TIMESTAMP NOT NULL;") - -try: - cursor.execute("ALTER TABLE fileset ADD COLUMN `user_count` INT;") -except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE fileset MODIFY COLUMN `user_count` INT;") - -try: - cursor.execute("ALTER TABLE file ADD 
COLUMN punycode_name VARCHAR(200);") -except Exception: - cursor.execute("ALTER TABLE file MODIFY COLUMN punycode_name VARCHAR(200);") - -try: - cursor.execute( - "ALTER TABLE file ADD COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" - ) -except Exception: - cursor.execute( - "ALTER TABLE file MODIFY COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" +def init_database(): + # Load MySQL credentials + base_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join(base_dir, "mysql_config.json") + with open(config_path) as f: + mysql_cred = json.load(f) + + servername = mysql_cred["servername"] + username = mysql_cred["username"] + password = mysql_cred["password"] + dbname = mysql_cred["dbname"] + + # Create connection + conn = pymysql.connect( + host=servername, + user=username, + password=password, + charset="utf8mb4", + cursorclass=pymysql.cursors.DictCursor, + autocommit=False, ) -try: - cursor.execute( - "ALTER TABLE file ADD COLUMN `size-r` BIGINT DEFAULT 0, ADD COLUMN `size-rd` BIGINT DEFAULT 0;" - ) -except Exception: - cursor.execute( - "ALTER TABLE file MODIFY COLUMN `size-r` BIGINT DEFAULT 0, MODIFY COLUMN `size-rd` BIGINT DEFAULT 0;" - ) -try: - cursor.execute("ALTER TABLE log ADD COLUMN `text` varchar(1000);") -except Exception: - cursor.execute("ALTER TABLE log MODIFY COLUMN `text` varchar(1000);") + # Check connection + if conn is None: + print("Error connecting to MySQL") + exit(1) + + cursor = conn.cursor() + + # Create database + sql = f"CREATE DATABASE IF NOT EXISTS {dbname}" + cursor.execute(sql) + + # Use database + cursor.execute(f"USE {dbname}") + + # Create tables + tables = { + "engine": """ + CREATE TABLE IF NOT EXISTS engine ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(200), + engineid VARCHAR(100) NOT NULL + ) + """, + "game": """ + CREATE TABLE IF NOT EXISTS game ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(200), + engine INT NOT NULL, + gameid VARCHAR(100) NOT NULL, + extra VARCHAR(200), + platform VARCHAR(30), + language VARCHAR(10), + FOREIGN KEY (engine) REFERENCES engine(id) + ) + """, + "fileset": """ + CREATE TABLE IF NOT EXISTS fileset ( + id INT AUTO_INCREMENT PRIMARY KEY, + game INT, + status VARCHAR(20), + src VARCHAR(20), + `key` VARCHAR(64), + `megakey` VARCHAR(64), + `delete` BOOLEAN DEFAULT FALSE NOT NULL, + `timestamp` TIMESTAMP NOT NULL, + detection_size INT, + FOREIGN KEY (game) REFERENCES game(id) + ) + """, + "file": """ + CREATE TABLE IF NOT EXISTS file ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(200) NOT NULL, + size BIGINT NOT NULL, + checksum VARCHAR(64) NOT NULL, + fileset INT NOT NULL, + detection BOOLEAN NOT NULL, + FOREIGN KEY (fileset) REFERENCES fileset(id) ON DELETE CASCADE + ) + """, + "filechecksum": """ + CREATE TABLE IF NOT EXISTS filechecksum ( + id INT AUTO_INCREMENT PRIMARY KEY, + file INT NOT NULL, + checksize VARCHAR(10) NOT NULL, + checktype VARCHAR(10) NOT NULL, + checksum VARCHAR(64) NOT NULL, + FOREIGN KEY (file) REFERENCES file(id) ON DELETE CASCADE + ) + """, + "queue": """ + CREATE TABLE IF NOT EXISTS queue ( + id INT AUTO_INCREMENT PRIMARY KEY, + time TIMESTAMP NOT NULL, + notes varchar(300), + fileset INT, + userid INT NOT NULL, + commit VARCHAR(64) NOT NULL, + FOREIGN KEY (fileset) REFERENCES fileset(id) + ) + """, + "log": """ + CREATE TABLE IF NOT EXISTS log ( + id INT AUTO_INCREMENT PRIMARY KEY, + `timestamp` TIMESTAMP NOT NULL, + category VARCHAR(100) NOT NULL, + user VARCHAR(100) NOT NULL, + `text` varchar(300) + ) + """, + "history": """ + CREATE TABLE IF NOT 
EXISTS history ( + id INT AUTO_INCREMENT PRIMARY KEY, + `timestamp` TIMESTAMP NOT NULL, + fileset INT NOT NULL, + oldfileset INT NOT NULL, + log INT + ) + """, + "transactions": """ + CREATE TABLE IF NOT EXISTS transactions ( + id INT AUTO_INCREMENT PRIMARY KEY, + `transaction` INT NOT NULL, + fileset INT NOT NULL + ) + """, + "possible_merges": """ + CREATE TABLE IF NOT EXISTS possible_merges ( + id INT AUTO_INCREMENT PRIMARY KEY, + child_fileset INT, + parent_fileset INT, + FOREIGN KEY (child_fileset) REFERENCES fileset(id) ON DELETE CASCADE, + FOREIGN KEY (parent_fileset) REFERENCES fileset(id) ON DELETE CASCADE + ) + """, + } + + for table, definition in tables.items(): + try: + cursor.execute(definition) + print(f"Table '{table}' created successfully") + except pymysql.Error as err: + print(f"Error creating '{table}' table: {err}") + + # Create indices + indices = { + "detection": "CREATE INDEX detection ON file (detection)", + "checksum": "CREATE INDEX checksum ON filechecksum (checksum)", + "engineid": "CREATE INDEX engineid ON engine (engineid)", + "key": "CREATE INDEX fileset_key ON fileset (`key`)", + "status": "CREATE INDEX status ON fileset (status)", + "fileset": "CREATE INDEX fileset ON history (fileset)", + "file_name_size": "CREATE INDEX file_name_size ON file (name, size)", + "file_fileset_detection": "CREATE INDEX file_fileset_detection ON file (fileset, detection)", + } - -for index, definition in indices.items(): try: - cursor.execute(definition) - print(f"Created index for '{index}'") - except pymysql.Error as err: - print(f"Error creating index for '{index}': {err}") - - -# Insert random data into tables -def random_string(length=10): - return "".join(random.choices(string.ascii_letters + string.digits, k=length)) - - -def insert_random_data(): - for _ in range(1000): - # Insert data into engine - cursor.execute( - "INSERT INTO engine (name, engineid) VALUES (%s, %s)", - (random_string(), random_string()), - ) + cursor.execute("ALTER TABLE file ADD COLUMN detection_type VARCHAR(20);") + except Exception: + # if aleady exists, change the length of the column + cursor.execute("ALTER TABLE file MODIFY COLUMN detection_type VARCHAR(20);") - # Insert data into game - cursor.execute( - "INSERT INTO game (name, engine, gameid, extra, platform, language) VALUES (%s, %s, %s, %s, %s, %s)", - ( - random_string(), - 1, - random_string(), - random_string(), - random_string(), - random_string(), - ), - ) - - # Insert data into fileset - cursor.execute( - "INSERT INTO fileset (game, status, src, `key`, `megakey`, `timestamp`, detection_size) VALUES (%s, %s, %s, %s, %s, %s, %s)", - ( - 1, - "user", - random_string(), - random_string(), - random_string(), - datetime.now(), - random.randint(1, 100), - ), - ) + try: + cursor.execute("ALTER TABLE file ADD COLUMN `timestamp` TIMESTAMP NOT NULL;") + except Exception: + # if aleady exists, change the length of the column + cursor.execute("ALTER TABLE file MODIFY COLUMN `timestamp` TIMESTAMP NOT NULL;") - # Insert data into file - cursor.execute( - "INSERT INTO file (name, size, checksum, fileset, detection) VALUES (%s, %s, %s, %s, %s)", - (random_string(), random.randint(1000, 10000), random_string(), 1, True), - ) + try: + cursor.execute("ALTER TABLE fileset ADD COLUMN `user_count` INT;") + except Exception: + # if aleady exists, change the length of the column + cursor.execute("ALTER TABLE fileset MODIFY COLUMN `user_count` INT;") - # Insert data into filechecksum - cursor.execute( - "INSERT INTO filechecksum (file, checksize, checktype, 
checksum) VALUES (%s, %s, %s, %s)", - (1, random_string(), random_string(), random_string()), - ) + try: + cursor.execute("ALTER TABLE file ADD COLUMN punycode_name VARCHAR(200);") + except Exception: + cursor.execute("ALTER TABLE file MODIFY COLUMN punycode_name VARCHAR(200);") - # Insert data into queue + try: cursor.execute( - "INSERT INTO queue (time, notes, fileset, userid, commit) VALUES (%s, %s, %s, %s, %s)", - ( - datetime.now(), - random_string(), - 1, - random.randint(1, 100), - random_string(), - ), + "ALTER TABLE file ADD COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" ) - - # Insert data into log + except Exception: cursor.execute( - "INSERT INTO log (`timestamp`, category, user, `text`) VALUES (%s, %s, %s, %s)", - (datetime.now(), random_string(), random_string(), random_string()), + "ALTER TABLE file MODIFY COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" ) - # Insert data into history + try: cursor.execute( - "INSERT INTO history (`timestamp`, fileset, oldfileset, log) VALUES (%s, %s, %s, %s)", - (datetime.now(), 1, 2, 1), + "ALTER TABLE file ADD COLUMN `size-r` BIGINT DEFAULT 0, ADD COLUMN `size-rd` BIGINT DEFAULT 0;" ) - - # Insert data into transactions + except Exception: cursor.execute( - "INSERT INTO transactions (`transaction`, fileset) VALUES (%s, %s)", - (random.randint(1, 100), 1), + "ALTER TABLE file MODIFY COLUMN `size-r` BIGINT DEFAULT 0, MODIFY COLUMN `size-rd` BIGINT DEFAULT 0;" ) - - -# for testing locally -# insert_random_data() - -conn.commit() -conn.close() + try: + cursor.execute("ALTER TABLE log ADD COLUMN `text` varchar(5000);") + except Exception: + cursor.execute("ALTER TABLE log MODIFY COLUMN `text` varchar(5000);") + + for index, definition in indices.items(): + try: + cursor.execute(definition) + print(f"Created index for '{index}'") + except pymysql.Error as err: + print(f"Error creating index for '{index}': {err}") + + # Insert random data into tables + def random_string(length=10): + return "".join(random.choices(string.ascii_letters + string.digits, k=length)) + + def insert_random_data(): + for _ in range(1000): + # Insert data into engine + cursor.execute( + "INSERT INTO engine (name, engineid) VALUES (%s, %s)", + (random_string(), random_string()), + ) + + # Insert data into game + cursor.execute( + "INSERT INTO game (name, engine, gameid, extra, platform, language) VALUES (%s, %s, %s, %s, %s, %s)", + ( + random_string(), + 1, + random_string(), + random_string(), + random_string(), + random_string(), + ), + ) + + # Insert data into fileset + cursor.execute( + "INSERT INTO fileset (game, status, src, `key`, `megakey`, `timestamp`, detection_size) VALUES (%s, %s, %s, %s, %s, %s, %s)", + ( + 1, + "user", + random_string(), + random_string(), + random_string(), + datetime.now(), + random.randint(1, 100), + ), + ) + + # Insert data into file + cursor.execute( + "INSERT INTO file (name, size, checksum, fileset, detection) VALUES (%s, %s, %s, %s, %s)", + ( + random_string(), + random.randint(1000, 10000), + random_string(), + 1, + True, + ), + ) + + # Insert data into filechecksum + cursor.execute( + "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)", + (1, random_string(), random_string(), random_string()), + ) + + # Insert data into queue + cursor.execute( + "INSERT INTO queue (time, notes, fileset, userid, commit) VALUES (%s, %s, %s, %s, %s)", + ( + datetime.now(), + random_string(), + 1, + random.randint(1, 100), + random_string(), + ), + ) + + # Insert data into log + cursor.execute( + "INSERT INTO 
log (`timestamp`, category, user, `text`) VALUES (%s, %s, %s, %s)", + (datetime.now(), random_string(), random_string(), random_string()), + ) + + # Insert data into history + cursor.execute( + "INSERT INTO history (`timestamp`, fileset, oldfileset, log) VALUES (%s, %s, %s, %s)", + (datetime.now(), 1, 2, 1), + ) + + # Insert data into transactions + cursor.execute( + "INSERT INTO transactions (`transaction`, fileset) VALUES (%s, %s)", + (random.randint(1, 100), 1), + ) + + # for testing locally + # insert_random_data() + + conn.commit() + conn.close() + + +if __name__ == "__main__": + init_database() From 31b7d4ffc19fb2ea3cb9948d87060f62fba5dbc9 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 8 Jul 2025 23:39:53 +0530 Subject: [PATCH 09/30] INTEGRITY: Remove global database connection object from fileset.py, which is never closed. --- fileset.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/fileset.py b/fileset.py index 3f7bc40..dc91d33 100644 --- a/fileset.py +++ b/fileset.py @@ -32,21 +32,6 @@ secret_key = os.urandom(24) -base_dir = os.path.dirname(os.path.abspath(__file__)) -config_path = os.path.join(base_dir, "mysql_config.json") -with open(config_path) as f: - mysql_cred = json.load(f) - -conn = pymysql.connect( - host=mysql_cred["servername"], - user=mysql_cred["username"], - password=mysql_cred["password"], - db=mysql_cred["dbname"], - charset="utf8mb4", - cursorclass=pymysql.cursors.DictCursor, - autocommit=False, -) - @app.route("/") def index(): @@ -348,7 +333,7 @@ def fileset(): html += "Description\n" html += "Log Text\n" - related_filesets = get_all_related_filesets(id, conn) + related_filesets = get_all_related_filesets(id, connection) cursor.execute( f"SELECT * FROM history WHERE fileset IN ({','.join(map(str, related_filesets))}) OR oldfileset IN ({','.join(map(str, related_filesets))})" @@ -971,9 +956,12 @@ def validate(): del json_response["files"] json_response["status"] = "no_metadata" - fileset_id = user_insert_fileset(json_object, ip, conn) + conn = db_connect() + try: + fileset_id = user_insert_fileset(json_object, ip, conn) + finally: + conn.close() json_response["fileset"] = fileset_id - print(f"Response: {json_response}") return jsonify(json_response) matched_map = {} From e125227f3e94c705f81b965f2e6556da63b403eb Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 00:04:52 +0530 Subject: [PATCH 10/30] INTEGRITY: Filter manual merge candidates if size mismatch. 
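Manual merges are no longer logged directly from set_perform_match; unmatched set.dat filesets are collected in manual_merge_map and the 'Manual Merge Required' log entries are written once the whole dat has been processed. Additionally, whenever a candidate fileset gets auto-merged, remove_manual_merge_if_size_mismatch() re-checks every pending manual merge that still lists it: if any of its detection files has no name-and-size match in the corresponding set.dat fileset, the candidate is removed from manual_merge_map, from set_to_candidate_dict and from the possible_merges table.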
--- db_functions.py | 169 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 135 insertions(+), 34 deletions(-) diff --git a/db_functions.py b/db_functions.py index 393906b..77fa453 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1052,6 +1052,8 @@ def set_process( del set_to_candidate_dict[set_fileset] del id_to_fileset_dict[set_fileset] + manual_merge_map = defaultdict(list) + for fileset_id, candidate_filesets in set_to_candidate_dict.items(): fileset = id_to_fileset_dict[fileset_id] @@ -1060,16 +1062,6 @@ def set_process( fileset["name"], candidate_filesets, conn ) - for candidate_fileset in candidate_filesets: - with conn.cursor() as cursor: - cursor.execute( - "SELECT id FROM fileset WHERE status = 'current' AND id = %s", - (candidate_fileset), - ) - result = cursor.fetchone() - if result: - candidate_filesets.remove(candidate_fileset) - ( fully_matched_filesets, auto_merged_filesets, @@ -1086,14 +1078,31 @@ def set_process( auto_merged_filesets, manual_merged_filesets, mismatch_filesets, + manual_merge_map, + set_to_candidate_dict, conn, skiplog, ) + # print(manual_merge_map) + + for fileset_id, candidates in manual_merge_map.items(): + category_text = "Manual Merge Required" + log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidates)}." + manual_merged_filesets += 1 + # print(candidates) + add_manual_merge( + candidates, + fileset_id, + category_text, + log_text, + log_text, + user, + conn, + ) + # Final log with conn.cursor() as cursor: - cursor.execute("UPDATE fileset SET status = 'partial' WHERE status = 'current'") - cursor.execute( "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", (transaction_id,), @@ -1156,6 +1165,8 @@ def set_perform_match( auto_merged_filesets, manual_merged_filesets, mismatch_filesets, + manual_merge_map, + set_to_candidate_dict, conn, skiplog, ): @@ -1170,7 +1181,7 @@ def set_perform_match( ) status = cursor.fetchone()["status"] if status == "detection": - update_fileset_status(cursor, matched_fileset_id, "current") + update_fileset_status(cursor, matched_fileset_id, "parital") set_populate_file(fileset, matched_fileset_id, conn, detection) auto_merged_filesets += 1 if not skiplog: @@ -1183,6 +1194,9 @@ def set_perform_match( conn, ) delete_original_fileset(fileset_id, conn) + remove_manual_merge_if_size_mismatch( + matched_fileset_id, manual_merge_map, set_to_candidate_dict, conn + ) elif status == "partial" or status == "full": (is_match, unmatched_files) = is_full_checksum_match( matched_fileset_id, fileset, conn @@ -1221,7 +1235,7 @@ def set_perform_match( for candidate_fileset in candidate_filesets: (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn) if is_match: - update_fileset_status(cursor, candidate_fileset, "current") + update_fileset_status(cursor, candidate_fileset, "partial") set_populate_file(fileset, candidate_fileset, conn, detection) auto_merged_filesets += 1 if not skiplog: @@ -1234,22 +1248,14 @@ def set_perform_match( conn, ) delete_original_fileset(fileset_id, conn) + remove_manual_merge_if_size_mismatch( + candidate_fileset, manual_merge_map, set_to_candidate_dict, conn + ) found_match = True break if not found_match: - category_text = "Manual Merge Required" - log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidate_filesets)}." 
- manual_merged_filesets += 1 - add_manual_merge( - candidate_filesets, - fileset_id, - category_text, - log_text, - log_text, - user, - conn, - ) + manual_merge_map[fileset_id] = candidate_filesets return ( fully_matched_filesets, @@ -1259,6 +1265,98 @@ def set_perform_match( ) +def remove_manual_merge_if_size_mismatch( + child_fileset, manual_merge_map, set_to_candidate_dict, conn +): + with conn.cursor() as cursor: + query = """ + SELECT f.name, f.size + FROM fileset fs + JOIN file f ON f.fileset = fs.id + WHERE fs.id = %s + AND f.detection = 1 + """ + cursor.execute(query, (child_fileset,)) + files = cursor.fetchall() + + for parent_fileset, child_list in manual_merge_map.items(): + if child_fileset not in child_list: + continue + + for file in files: + if file["size"] == -1: + continue + + query = """ + SELECT f.id + FROM fileset fs + JOIN file f ON f.fileset = fs.id + WHERE fs.id = %s + AND f.name = %s + AND f.size = %s + """ + cursor.execute(query, (parent_fileset, file["name"], file["size"])) + result = cursor.fetchall() + + if not result: + remove_manual_merge( + child_fileset, + parent_fileset, + manual_merge_map, + set_to_candidate_dict, + conn, + ) + break + + for parent_fileset, child_list in set_to_candidate_dict.items(): + if child_fileset not in child_list: + continue + + for file in files: + if file["size"] == -1: + continue + + query = """ + SELECT f.id + FROM fileset fs + JOIN file f ON f.fileset = fs.id + WHERE fs.id = %s + AND f.name = %s + AND f.size = %s + """ + cursor.execute(query, (parent_fileset, file["name"], file["size"])) + result = cursor.fetchall() + + if not result: + remove_manual_merge( + child_fileset, + parent_fileset, + manual_merge_map, + set_to_candidate_dict, + conn, + ) + break + + +def remove_manual_merge( + child_fileset, parent_fileset, manual_merge_map, set_to_candidate_dict, conn +): + if parent_fileset in manual_merge_map: + if child_fileset in manual_merge_map[parent_fileset]: + manual_merge_map[parent_fileset].remove(child_fileset) + if parent_fileset in set_to_candidate_dict: + if child_fileset in set_to_candidate_dict[parent_fileset]: + set_to_candidate_dict[parent_fileset].remove(child_fileset) + + with conn.cursor() as cursor: + query = """ + DELETE FROM possible_merges + WHERE child_fileset = %s + AND parent_fileset = %s + """ + cursor.execute(query, (child_fileset, parent_fileset)) + + def add_manual_merge( child_filesets, parent_fileset, category_text, log_text, print_text, user, conn ): @@ -1835,15 +1933,18 @@ def set_populate_file(fileset, fileset_id, conn, detection): filename = os.path.basename(normalised_path(file["name"])) - if (engine_name == "glk" and file["size"] not in candidate_file_size) and ( - (filename.lower(), file["size"]) in seen_detection_files - or ( - filename.lower() not in candidate_files + if (engine_name == "glk" and file["size"] not in candidate_file_size) or ( + engine_name != "glk" + and ( + (filename.lower(), file["size"]) in seen_detection_files or ( - filename.lower() in candidate_files - and ( - candidate_files[filename.lower()][1] != -1 - and candidate_files[filename.lower()][1] != file["size"] + filename.lower() not in candidate_files + or ( + filename.lower() in candidate_files + and ( + candidate_files[filename.lower()][1] != -1 + and candidate_files[filename.lower()][1] != file["size"] + ) ) ) ) From 898ffd064332840409f9a1070b13a2e0e8a6ff20 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 16:18:34 +0530 Subject: [PATCH 11/30] INTEGRITY: Add metadata for set.dat --- 
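Every header field of a set.dat entry other than its rom list is serialized into the new fileset.set_dat_metadata column as "key = value , " pairs (e.g. "name = ... , description = ... , ", depending on which header fields the dat provides), and the fileset page now displays this column for filesets whose status is 'dat'.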
db_functions.py | 12 +++++++++++- fileset.py | 26 ++++++++++++++++++++------ schema.py | 9 +++++++++ 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/db_functions.py b/db_functions.py index 77fa453..2e022e3 100644 --- a/db_functions.py +++ b/db_functions.py @@ -107,6 +107,7 @@ def insert_fileset( transaction, log_text, conn, + set_dat_metadata="", ip="", username=None, skiplog=None, @@ -162,7 +163,7 @@ def insert_fileset( return (existing_entry, True) # $game and $key should not be parsed as a mysql string, hence no quotes - query = f"INSERT INTO fileset (game, status, src, `key`, megakey, `timestamp`) VALUES ({game}, '{status}', '{src}', {key}, {megakey}, FROM_UNIXTIME(@fileset_time_last))" + query = f"INSERT INTO fileset (game, status, src, `key`, megakey, `timestamp`, set_dat_metadata) VALUES ({game}, '{status}', '{src}', {key}, {megakey}, FROM_UNIXTIME(@fileset_time_last), '{escape_string(set_dat_metadata)}')" fileset_id = -1 with conn.cursor() as cursor: cursor.execute(query) @@ -968,6 +969,11 @@ def set_process( megakey = "" log_text = f"State {source_status}." + set_dat_metadata = "" + for meta in fileset: + if meta != "rom": + set_dat_metadata += meta + " = " + fileset[meta] + " , " + (fileset_id, existing) = insert_new_fileset( fileset, conn, @@ -978,8 +984,10 @@ def set_process( transaction_id, log_text, user, + set_dat_metadata=set_dat_metadata, skiplog=skiplog, ) + if existing: continue @@ -2030,6 +2038,7 @@ def insert_new_fileset( transaction_id, log_text, user, + set_dat_metadata="", ip="", skiplog=False, ): @@ -2042,6 +2051,7 @@ def insert_new_fileset( log_text, conn, username=user, + set_dat_metadata=set_dat_metadata, ip=ip, skiplog=skiplog, ) diff --git a/fileset.py b/fileset.py index dc91d33..85bc093 100644 --- a/fileset.py +++ b/fileset.py @@ -155,18 +155,32 @@ def fileset(): (id,), ) row = cursor.fetchone() - print(row) if row: id = row["fileset"] - cursor.execute(f"SELECT * FROM fileset WHERE id = {id}") + cursor.execute("SELECT status FROM fileset WHERE id = %s", (id,)) + status = cursor.fetchone()["status"] + + if status == "dat": + cursor.execute( + """SELECT id, game, status, src, `key`, megakey, `delete`, timestamp, set_dat_metadata FROM fileset WHERE id = %s""", + (id,), + ) + else: + cursor.execute( + """SELECT id, game, status, src, `key`, megakey, `delete`, timestamp, detection_size, user_count FROM fileset WHERE id = %s""", + (id,), + ) + result = cursor.fetchone() - print(result) html += "

Fileset details

" html += "\n" if result["game"]: - cursor.execute( - f"SELECT game.name as 'game name', engineid, gameid, extra, platform, language FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = {id}" - ) + if status == "dat": + query = """SELECT game.name as 'game name', engineid, gameid, extra, platform, language, fileset.set_dat_metadata FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = %s""" + else: + query = """SELECT game.name as 'game name', engineid, gameid, extra, platform, language FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = %s""" + print(query) + cursor.execute(query, (id,)) result = {**result, **cursor.fetchone()} else: # result.pop('key', None) diff --git a/schema.py b/schema.py index 826be7b..4b9ba42 100644 --- a/schema.py +++ b/schema.py @@ -209,6 +209,15 @@ def init_database(): except Exception: cursor.execute("ALTER TABLE log MODIFY COLUMN `text` varchar(5000);") + try: + cursor.execute( + "ALTER TABLE fileset ADD COLUMN set_dat_metadata varchar(5000) DEFAULT 'UTF-8';" + ) + except Exception: + cursor.execute( + "ALTER TABLE fileset MODIFY COLUMN set_dat_metadata varchar(5000) DEFAULT 'UTF-8';" + ) + for index, definition in indices.items(): try: cursor.execute(definition) From 8970cd6a9a04074ccb836f532b499613ccd8b447 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 18:30:28 +0530 Subject: [PATCH 12/30] INTEGRITY: Add navbar with logo. --- fileset.py | 35 ++++++++++++++++++ pagination.py | 5 +++ favicon-16x16.png => static/favicon-16x16.png | Bin favicon-32x32.png => static/favicon-32x32.png | Bin .../integrity_service_logo_256.png | Bin 5 files changed, 40 insertions(+) rename favicon-16x16.png => static/favicon-16x16.png (100%) rename favicon-32x32.png => static/favicon-32x32.png (100%) rename integrity_service_logo_256.png => static/integrity_service_logo_256.png (100%) diff --git a/fileset.py b/fileset.py index 85bc093..436652d 100644 --- a/fileset.py +++ b/fileset.py @@ -42,6 +42,11 @@ def index(): +

Fileset Database

Fileset Actions

    @@ -138,6 +143,11 @@ def fileset(): +

    Fileset: {id}

""" @@ -466,6 +476,11 @@ def match_fileset_route(id): +

Matched Filesets for Fileset: {id}

@@ -553,6 +568,11 @@ def merge_fileset(id): +

Search Results for '{search_query}'

@@ -587,6 +607,11 @@ def merge_fileset(id): +

Search Fileset to Merge

@@ -641,6 +666,11 @@ def possible_merge_filesets(id): +

Possible Merges for fileset-'{id}'

@@ -748,6 +778,11 @@ def highlight_differences(source, target): +

Confirm Merge

<tr><th>ID</th><th>Game Name</th><th>Platform</th><th>Language</th><th>Extra</th><th>Details</th><th>Action</th></tr>
diff --git a/pagination.py b/pagination.py index cb8ba3d..28b82f5 100644 --- a/pagination.py +++ b/pagination.py @@ -141,6 +141,11 @@ def create_page( +
<tr><th>Field</th><th>Source Fileset</th><th>Target Fileset</th></tr>
""" diff --git a/favicon-16x16.png b/static/favicon-16x16.png similarity index 100% rename from favicon-16x16.png rename to static/favicon-16x16.png diff --git a/favicon-32x32.png b/static/favicon-32x32.png similarity index 100% rename from favicon-32x32.png rename to static/favicon-32x32.png diff --git a/integrity_service_logo_256.png b/static/integrity_service_logo_256.png similarity index 100% rename from integrity_service_logo_256.png rename to static/integrity_service_logo_256.png From bd3f2f4ece9f06d3fa05da1d1153974c1456cae1 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 19:44:37 +0530 Subject: [PATCH 13/30] INTEGRITY: Add modification timestamps for macfiles --- compute_hash.py | 77 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/compute_hash.py b/compute_hash.py index 3b0e155..94bf18b 100644 --- a/compute_hash.py +++ b/compute_hash.py @@ -4,7 +4,7 @@ import struct import sys from enum import Enum -from datetime import datetime, date +from datetime import datetime, date, timedelta from collections import defaultdict class FileType(Enum): @@ -75,9 +75,10 @@ def get_dirs_at_depth(directory, depth): if depth == num_sep_this - num_sep: yield root -def read_be_32(byte_stream): +def read_be_32(byte_stream, signed=False): """ Return unsigned integer of size_in_bits, assuming the data is big-endian """ - (uint,) = struct.unpack(">I", byte_stream[:32//8]) + format = ">i" if signed else ">I" + (uint,) = struct.unpack(format, byte_stream[:32//8]) return uint def read_be_16(byte_stream): @@ -534,7 +535,6 @@ def compute_hash_of_dirs(root_directory, depth, size=0, limit_timestamps_date=No for filepath in filtered_file_map: file_collection[filepath] = file_classification(filepath) - # Remove extra entries of macfiles to avoid extra checksum calculation in form of non mac files # Checksum for both the forks are calculated using a single file, so other files should be removed from the collection file_filter(file_collection) @@ -557,6 +557,70 @@ def compute_hash_of_dirs(root_directory, depth, size=0, limit_timestamps_date=No return res +def extract_macbin_mtime(file_byte_stream): + """ + Returns modification time of macbinary file from the header. + Doc - +$5f / 4: modification date/time. + Doc - Timestamps are unsigned 32-bit values indicating the time in seconds since midnight on Jan 1, 1904, in local time. + """ + macbin_epoch = datetime(1904, 1, 1) + header = file_byte_stream[:128] + macbin_seconds = read_be_32(header[0x5f:]) + return (macbin_epoch + timedelta(seconds=macbin_seconds)).date() + + +def extract_mtime_appledouble(file_byte_stream): + """ + Returns modification time of appledouble file. + Doc 1 - The File Dates Info entry (ID=8) consists of the file creation, modification, backup + and access times (see Figure 2-1), stored as a signed number of seconds before + or after 12:00 a.m. 
(midnight), January 1, 2000 Greenwich Mean Time (GMT) + + Doc 2 - + struct ASFileDates /* entry ID 8, file dates info */ + { + sint32 create; /* file creation date/time */ + sint32 modify; /* last modification date/time */ + sint32 backup; /* last backup date/time */ + sint32 access; /* last access date/time */ + }; /* ASFileDates */ + """ + entry_count = read_be_16(file_byte_stream[24:]) + for entry in range(entry_count): + start_index = 26 + entry*12 + id = read_be_32(file_byte_stream[start_index:]) + offset = read_be_32(file_byte_stream[start_index+4:]) + length = read_be_32(file_byte_stream[start_index+8:]) + + if id == 8: + date_info_data = file_byte_stream[offset:offset + length] + if len(date_info_data) < 16: + raise ValueError("FileDatesInfo block is too short.") + appledouble_epoch = datetime(2000, 1, 1) + modify_seconds = read_be_32(date_info_data[4:8], signed=True) + return (appledouble_epoch + timedelta(seconds=modify_seconds)).date() + + return None + + +def macfile_timestamp(filepath): + """ + Returns the modification times for the mac file from their finderinfo. + If the file is not a macfile, it returns None + """ + with open(filepath, "rb") as f: + data = f.read() + # Macbinary + if is_macbin(filepath): + return extract_macbin_mtime(data) + + # Appledouble + if is_appledouble_rsrc(filepath) or is_appledouble_in_dot_(filepath) or is_appledouble_in_macosx(filepath): + return extract_mtime_appledouble(data) + + return None + + def validate_date(date_str): """ Confirms if the user provided timestamp is in a valid format. @@ -579,12 +643,15 @@ def filter_files_by_timestamp(files, limit_timestamps_date): """ filtered_file_map = defaultdict(str) + if limit_timestamp_date is not None: user_date = validate_date(limit_timestamps_date) today = date.today() for filepath in files: - mtime = datetime.fromtimestamp(os.path.getmtime(filepath)).date() + mtime = macfile_timestamp(filepath) + if mtime is None: + mtime = datetime.fromtimestamp(os.path.getmtime(filepath)).date() if limit_timestamps_date is None or (limit_timestamps_date is not None and (mtime <= user_date or mtime == today)): filtered_file_map[filepath] = str(mtime) From 8575f8e90b57c84f89b86fb5536b7b902276e50e Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 19:45:33 +0530 Subject: [PATCH 14/30] INTEGIRTY: Add punycode encoding for scan utlity. 
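
The helpers introduced in this patch decide when a filename has to be punyencoded before it is written to the DAT file: reserved symbols, control characters, non-ASCII characters, and names ending in a space or dot all force encoding. A minimal standalone sketch of that decision rule (illustrative only, not the script's implementation; the sample names are made up):

    SPECIAL_SYMBOLS = '/":*|\\?%<>\x7f'

    def needs_punyencoding(name: str) -> bool:
        # Names that already carry the punycode prefix are left untouched.
        if name.startswith("xn--"):
            return False
        # Control chars, non-ASCII chars and reserved symbols force encoding.
        if not all(0x20 <= ord(c) < 0x80 and c not in SPECIAL_SYMBOLS for c in name):
            return True
        # Filenames may not end in a space or a dot.
        return name[-1] in " ."

    for sample in ["launcher.dat", "Read Me ", 'bad"name', "xn--already-encoded"]:
        print(sample, "->", needs_punyencoding(sample))
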
--- compute_hash.py | 99 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 19 deletions(-) diff --git a/compute_hash.py b/compute_hash.py index 94bf18b..626e622 100644 --- a/compute_hash.py +++ b/compute_hash.py @@ -18,6 +18,8 @@ class FileType(Enum): script_version = "0.1" +SPECIAL_SYMBOLS = '/":*|\\?%<>\x7f' + # CRC table CRC16_XMODEM_TABLE = [ 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, @@ -75,6 +77,83 @@ def get_dirs_at_depth(directory, depth): if depth == num_sep_this - num_sep: yield root + +def my_escape_string(s: str) -> str: + """ + Escape strings + + Escape the following: + - escape char: \x81 + - unallowed filename chars: https://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words + - control chars < 0x20 + """ + new_name = "" + for char in s: + if char == "\x81": + new_name += "\x81\x79" + elif char in SPECIAL_SYMBOLS or ord(char) < 0x20: + new_name += "\x81" + chr(0x80 + ord(char)) + else: + new_name += char + return new_name + + +def encode_punycode(orig): + """ + Punyencode strings + + - escape special characters and + - ensure filenames can't end in a space or dotif temp == None: + """ + s = my_escape_string(orig) + encoded = s.encode("punycode").decode("ascii") + # punyencoding adds an '-' at the end when there are no special chars + # don't use it for comparing + compare = encoded + if encoded.endswith("-"): + compare = encoded[:-1] + if orig != compare or compare[-1] in " .": + return "xn--" + encoded + return orig + + +def punycode_need_encode(orig): + """ + A filename needs to be punyencoded when it: + + - contains a char that should be escaped or + - ends with a dot or a space. + """ + if len(orig) > 4 and orig[:4] == "xn--": + return False + if not all((0x20 <= ord(c) < 0x80) and c not in SPECIAL_SYMBOLS for c in orig): + return True + if orig[-1] in " .": + return True + return False + + +def split_path_recursive(path): + parts = [] + while True: + head, tail = os.path.split(path) + if tail: + parts.insert(0, tail) + path = head + else: + if head: + parts.insert(0, head) + break + return parts + +def encode_path_components(filepath): + """ + Puny encodes all separate components of filepath + """ + parts = split_path_recursive(filepath) + encoded_parts = [encode_punycode(p) if punycode_need_encode(p) else p for p in parts] + return os.path.join(*encoded_parts) + def read_be_32(byte_stream, signed=False): """ Return unsigned integer of size_in_bits, assuming the data is big-endian """ format = ">i" if signed else ">I" @@ -202,25 +281,6 @@ def macbin_get_datafork(file_byte_stream): (datalen,) = struct.unpack(">I", file_byte_stream[0x53:0x57]) return file_byte_stream[0x80: 0x80 + datalen] -def is_appledouble(file_byte_stream): - """ - Appledouble Structure - - - Header: - +$00 / 4: signature (0x00 0x05 0x16 0x00) - +$04 / 4: version (0x00 0x01 0x00 0x00 (v1) -or- 0x00 0x02 0x00 0x00 (v2)) - +$08 /16: home file system string (v1) -or- zeroes (v2) - +$18 / 2: number of entries - - Entries: - +$00 / 4: entry ID (1-15) - +$04 / 4: offset to data from start of file - +$08 / 4: length of entry in bytes; may be zero - """ - if (not file_byte_stream or read_be_32(file_byte_stream) != 0x00051607): - return False - - return True def appledouble_get_resfork_data(file_byte_stream): """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of an appledouble file""" @@ -672,6 +732,7 @@ def create_dat_file(hash_of_dirs, path, checksum_size=0): 
for hash_of_dir in hash_of_dirs: file.write("game (\n") for filename, (hashes, size, size_r, size_rd, timestamp) in hash_of_dir.items(): + filename = encode_path_components(filename) data = f"name \"{filename}\" size {size} size-r {size_r} size-rd {size_rd} timestamp {timestamp}" for key, value in hashes: data += f" {key} {value}" From df41d7afd55239e50f2e2c97fe2ccaa09aa2df53 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 10 Jul 2025 20:23:56 +0530 Subject: [PATCH 15/30] INTEGRITY: Fix the navbar on top. --- fileset.py | 28 ++++++++++++++-------------- pagination.py | 4 ++-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fileset.py b/fileset.py index 436652d..c8e9cf7 100644 --- a/fileset.py +++ b/fileset.py @@ -42,12 +42,12 @@ def index(): -
""" html += f"" @@ -476,12 +476,12 @@ def match_fileset_route(id): -
@@ -568,12 +568,12 @@ def merge_fileset(id): -
Fileset ID
""" @@ -778,12 +778,12 @@ def highlight_differences(source, target): -
<tr><th>ID</th><th>Game Name</th><th>Platform</th><th>Language</th><th>Extra</th><th>Details</th><th>Action</th></tr>
""" diff --git a/pagination.py b/pagination.py index 28b82f5..091384c 100644 --- a/pagination.py +++ b/pagination.py @@ -141,13 +141,13 @@ def create_page( -
<tr><th>Field</th><th>Source Fileset</th><th>Target Fileset</th></tr>
+
""" if not results: return "No results for given filters" From c38881cff93d4cdc2e9675c7f1cf162737344ebc Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Mon, 14 Jul 2025 15:55:57 +0530 Subject: [PATCH 16/30] INTEGRITY: Limit match fileset to 1 in remove_manual_merge_if_size_mismatch --- db_functions.py | 111 ++++++++++++++++++------------------------------ 1 file changed, 42 insertions(+), 69 deletions(-) diff --git a/db_functions.py b/db_functions.py index 2e022e3..4c2d927 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1179,7 +1179,7 @@ def set_perform_match( skiplog, ): """ - TODO + "Performs matching for set.dat" """ with conn.cursor() as cursor: if len(candidate_filesets) == 1: @@ -1189,11 +1189,11 @@ def set_perform_match( ) status = cursor.fetchone()["status"] if status == "detection": - update_fileset_status(cursor, matched_fileset_id, "parital") + update_fileset_status(cursor, matched_fileset_id, "partial") set_populate_file(fileset, matched_fileset_id, conn, detection) auto_merged_filesets += 1 if not skiplog: - set_log_matched_fileset( + log_matched_fileset( src, fileset_id, matched_fileset_id, @@ -1247,7 +1247,7 @@ def set_perform_match( set_populate_file(fileset, candidate_fileset, conn, detection) auto_merged_filesets += 1 if not skiplog: - set_log_matched_fileset( + log_matched_fileset( src, fileset_id, candidate_fileset, @@ -1287,63 +1287,37 @@ def remove_manual_merge_if_size_mismatch( cursor.execute(query, (child_fileset,)) files = cursor.fetchall() - for parent_fileset, child_list in manual_merge_map.items(): - if child_fileset not in child_list: - continue - - for file in files: - if file["size"] == -1: - continue - - query = """ - SELECT f.id - FROM fileset fs - JOIN file f ON f.fileset = fs.id - WHERE fs.id = %s - AND f.name = %s - AND f.size = %s - """ - cursor.execute(query, (parent_fileset, file["name"], file["size"])) - result = cursor.fetchall() - - if not result: - remove_manual_merge( - child_fileset, - parent_fileset, - manual_merge_map, - set_to_candidate_dict, - conn, - ) - break - - for parent_fileset, child_list in set_to_candidate_dict.items(): - if child_fileset not in child_list: - continue - - for file in files: - if file["size"] == -1: + for possible_removals in [manual_merge_map, set_to_candidate_dict]: + for parent_fileset, child_list in possible_removals.items(): + if child_fileset not in child_list: continue - query = """ - SELECT f.id - FROM fileset fs - JOIN file f ON f.fileset = fs.id - WHERE fs.id = %s - AND f.name = %s - AND f.size = %s - """ - cursor.execute(query, (parent_fileset, file["name"], file["size"])) - result = cursor.fetchall() - - if not result: - remove_manual_merge( - child_fileset, - parent_fileset, - manual_merge_map, - set_to_candidate_dict, - conn, - ) - break + for file in files: + if file["size"] == -1: + continue + + query = """ + SELECT fs.id + FROM fileset fs + JOIN file f ON f.fileset = fs.id + WHERE fs.id = %s + AND REGEXP_REPLACE(f.name, '^.*[\\\\/]', '') = %s + AND f.size = %s + LIMIT 1 + """ + filename = os.path.basename(normalised_path(file["name"])) + cursor.execute(query, (parent_fileset, filename, file["size"])) + result = cursor.fetchall() + + if not result: + remove_manual_merge( + child_fileset, + parent_fileset, + manual_merge_map, + set_to_candidate_dict, + conn, + ) + break def remove_manual_merge( @@ -2063,21 +2037,20 @@ def insert_new_fileset( cursor.execute("SELECT @file_last AS file_id") file_id = cursor.fetchone()["file_id"] for key, value in file.items(): - if key not in ["name", "size", 
"size-r", "size-rd", "sha1", "crc"]: + if key not in [ + "name", + "size", + "size-r", + "size-rd", + "sha1", + "crc", + "modification-time", + ]: insert_filechecksum(file, key, file_id, conn) return (fileset_id, existing) def log_matched_fileset(src, fileset_last, fileset_id, state, user, conn): - category_text = f"Matched from {src}" - log_text = f"Matched Fileset:{fileset_id}. State {state}." - log_last = create_log( - escape_string(category_text), user, escape_string(log_text), conn - ) - update_history(fileset_last, fileset_id, conn, log_last) - - -def set_log_matched_fileset(src, fileset_last, fileset_id, state, user, conn): category_text = f"Matched from {src}" log_text = ( f"Matched Fileset:{fileset_last} with Fileset:{fileset_id}. State {state}." From e86f9822ce22c93d605d2d7fd8144b0ea660e18a Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Mon, 14 Jul 2025 18:50:33 +0530 Subject: [PATCH 17/30] INTEGRITY: Improve console logging with progress update. --- db_functions.py | 112 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 29 deletions(-) diff --git a/db_functions.py b/db_functions.py index 4c2d927..bb6a2a8 100644 --- a/db_functions.py +++ b/db_functions.py @@ -9,11 +9,13 @@ from collections import defaultdict import re import copy +import sys SPECIAL_SYMBOLS = '/":*|\\?%<>\x7f' def db_connect(): + console_log("Connecting to the Database.") base_dir = os.path.dirname(os.path.abspath(__file__)) config_path = os.path.join(base_dir, "mysql_config.json") with open(config_path) as f: @@ -28,7 +30,7 @@ def db_connect(): cursorclass=pymysql.cursors.DictCursor, autocommit=False, ) - + console_log(f"Connected to Database - {mysql_cred['dbname']}") return conn @@ -526,12 +528,17 @@ def db_insert(data_arr, username=None, skiplog=False): transaction_id = temp + 1 category_text = f"Uploaded from {src}" - log_text = f"Started loading DAT file, size {os.path.getsize(filepath)}, author {author}, version {version}. State {status}. Transaction: {transaction_id}" + log_text = f"Started loading DAT file {filepath}, size {os.path.getsize(filepath)}, author {author}, version {version}. State {status}. Transaction: {transaction_id}" user = f"cli:{getpass.getuser()}" if username is None else username create_log(escape_string(category_text), user, escape_string(log_text), conn) + console_log(log_text) + console_log_total_filesets(filepath) + + fileset_count = 1 for fileset in game_data: + console_log_detection(fileset_count) key = calc_key(fileset) megakey = calc_megakey(fileset) @@ -555,7 +562,7 @@ def db_insert(data_arr, username=None, skiplog=False): if existing_entry is not None: log_text = f"Skipping Entry as similar entry already exsits - Fileset:{existing_entry['id']}. 
Skpped entry details - engineid = {engineid}, gameid = {gameid}, platform = {platform}, language = {lang}" create_log("Warning", user, escape_string(log_text), conn) - print(log_text) + console_log(log_text) continue insert_game( @@ -594,6 +601,8 @@ def db_insert(data_arr, username=None, skiplog=False): if key not in ["name", "size", "size-r", "size-rd", "sha1", "crc"]: insert_filechecksum(file, key, file_id, conn) + fileset_count += 1 + if detection: conn.cursor().execute( "UPDATE fileset SET status = 'obsolete' WHERE `timestamp` != FROM_UNIXTIME(@fileset_time_last) AND status = 'detection'" @@ -607,6 +616,7 @@ def db_insert(data_arr, username=None, skiplog=False): fileset_insertion_count = cur.fetchone()["COUNT(fileset)"] category_text = f"Uploaded from {src}" log_text = f"Completed loading DAT file, filename {filepath}, size {os.path.getsize(filepath)}, author {author}, version {version}. State {status}. Number of filesets: {fileset_insertion_count}. Transaction: {transaction_id}" + console_log(log_text) except Exception as e: print("Inserting failed:", e) else: @@ -871,8 +881,9 @@ def match_fileset(data_arr, username=None, skiplog=False): transaction_id = transaction_id + 1 if transaction_id else 1 category_text = f"Uploaded from {src}" - log_text = f"Started loading DAT file, size {os.path.getsize(filepath)}, author {author}, version {version}. State {source_status}. Transaction: {transaction_id}" - + log_text = f"Started loading DAT file {filepath}, size {os.path.getsize(filepath)}, author {author}, version {version}. State {source_status}. Transaction: {transaction_id}" + console_log(log_text) + console_log_total_filesets(filepath) user = f"cli:{getpass.getuser()}" if username is None else username create_log(escape_string(category_text), user, escape_string(log_text), conn) @@ -941,6 +952,9 @@ def set_process( mismatch_filesets = 0 dropped_early_no_candidate = 0 dropped_early_single_candidate_multiple_sets = 0 + + fileset_count = 0 + # A mapping from set filesets to candidate filesets list set_to_candidate_dict = defaultdict(list) id_to_fileset_dict = defaultdict(dict) @@ -995,12 +1009,12 @@ def set_process( engine_name = fileset["sourcefile"].split("-")[0] if engine_name == "glk": - candidate_filesets = set_glk_filter_candidate_filesets( - fileset_id, fileset, transaction_id, engine_name, conn + (candidate_filesets, fileset_count) = set_glk_filter_candidate_filesets( + fileset_id, fileset, fileset_count, transaction_id, engine_name, conn ) else: - candidate_filesets = set_filter_candidate_filesets( - fileset_id, fileset, transaction_id, conn + (candidate_filesets, fileset_count) = set_filter_candidate_filesets( + fileset_id, fileset, fileset_count, transaction_id, conn ) # Mac files in set.dat are not represented properly and they won't find a candidate fileset for a match, so we can drop them. @@ -1016,10 +1030,18 @@ def set_process( ) dropped_early_no_candidate += 1 delete_original_fileset(fileset_id, conn) - id_to_fileset_dict[fileset_id] = fileset set_to_candidate_dict[fileset_id].extend(candidate_filesets) + console_message = "Candidate filtering finished." + console_log(console_message) + console_message = ( + f"{dropped_early_no_candidate} Filesets Dropped - No candidates found." + ) + console_log(console_message) + console_message = "Looking for duplicates..." + console_log(console_message) + # Remove all such filesets, which have many to one mapping with a single candidate, those are extra variants. 
value_to_keys = defaultdict(list) for set_fileset, candidates in set_to_candidate_dict.items(): @@ -1052,6 +1074,7 @@ def set_process( fileset["description"] if "description" in fileset else "" ) log_text = f"Drop fileset, multiple filesets mapping to single detection. Name: {fileset_name}, Description: {fileset_description}. Clashed with Fileset:{candidate} ({engine}:{gameid}-{platform}-{language})" + console_log(log_text) create_log( escape_string(category_text), user, escape_string(log_text), conn ) @@ -1062,7 +1085,9 @@ def set_process( manual_merge_map = defaultdict(list) + match_count = 1 for fileset_id, candidate_filesets in set_to_candidate_dict.items(): + console_log_matching(match_count) fileset = id_to_fileset_dict[fileset_id] # Filter by platform to reduce manual merge @@ -1092,21 +1117,15 @@ def set_process( skiplog, ) - # print(manual_merge_map) + match_count += 1 + console_log("Matching performed.") for fileset_id, candidates in manual_merge_map.items(): category_text = "Manual Merge Required" log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidates)}." manual_merged_filesets += 1 - # print(candidates) add_manual_merge( - candidates, - fileset_id, - category_text, - log_text, - log_text, - user, - conn, + candidates, fileset_id, category_text, log_text, user, conn, log_text ) # Final log @@ -1121,6 +1140,7 @@ def set_process( create_log(escape_string(category_text), user, escape_string(log_text), conn) category_text = "Upload information" log_text = f"Number of filesets: {fileset_insertion_count}. Filesets automatically merged: {auto_merged_filesets}. Filesets dropped early (no candidate) - {dropped_early_no_candidate}. Filesets dropped early (mapping to single detection) - {dropped_early_single_candidate_multiple_sets}. Filesets requiring manual merge: {manual_merged_filesets}. Partial/Full filesets already present: {fully_matched_filesets}. Partial/Full filesets with mismatch {mismatch_filesets}." + console_log(log_text) create_log(escape_string(category_text), user, escape_string(log_text), conn) @@ -1225,14 +1245,13 @@ def set_perform_match( else: category_text = "Mismatch" log_text = f"Fileset:{fileset_id} mismatched with Fileset:{matched_fileset_id} with status:{status}. Try manual merge." - print_text = f"Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}. Unmatched files: {len(unmatched_files)}." + # print_text = f"Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}. Unmatched files: {len(unmatched_files)}." mismatch_filesets += 1 add_manual_merge( [matched_fileset_id], fileset_id, category_text, log_text, - print_text, user, conn, ) @@ -1340,7 +1359,7 @@ def remove_manual_merge( def add_manual_merge( - child_filesets, parent_fileset, category_text, log_text, print_text, user, conn + child_filesets, parent_fileset, category_text, log_text, user, conn, print_text=None ): """ Adds the manual merge entries to a table called possible_merges. 
@@ -1356,7 +1375,8 @@ def add_manual_merge( cursor.execute(query, (child_fileset, parent_fileset)) create_log(escape_string(category_text), user, escape_string(log_text), conn) - print(print_text) + if print_text: + print(print_text) def is_full_checksum_match(candidate_fileset, fileset, conn): @@ -1395,14 +1415,15 @@ def is_full_checksum_match(candidate_fileset, fileset, conn): def set_glk_filter_candidate_filesets( - fileset_id, fileset, transaction_id, engine_name, conn + fileset_id, fileset, fileset_count, transaction_id, engine_name, conn ): """ Returns a list of candidate filesets for glk engines that can be merged """ with conn.cursor() as cursor: # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches - + fileset_count += 1 + console_log_candidate_filtering(fileset_count) query = """ WITH candidate_fileset AS ( SELECT fs.id AS fileset_id, f.size @@ -1469,16 +1490,19 @@ def set_glk_filter_candidate_filesets( for row in rows: candidates.append(row["fileset_id"]) - return candidates + return (candidates, fileset_count) -def set_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): +def set_filter_candidate_filesets( + fileset_id, fileset, fileset_count, transaction_id, conn +): """ Returns a list of candidate filesets that can be merged """ with conn.cursor() as cursor: # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches - + fileset_count += 1 + console_log_candidate_filtering(fileset_count) query = """ WITH candidate_fileset AS ( SELECT fs.id AS fileset_id, f.name, f.size @@ -1536,7 +1560,7 @@ def set_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): for row in rows: candidates.append(row["fileset_id"]) - return candidates + return (candidates, fileset_count) def process_fileset( @@ -2265,3 +2289,33 @@ def add_usercount(fileset, conn): cursor.execute( f"UPDATE fileset SET status = 'ReadyForReview' WHERE id = {fileset}" ) + + +def console_log(message): + sys.stdout.write(" " * 50 + "\r") + sys.stdout.flush() + print(message) + + +def console_log_candidate_filtering(fileset_count): + sys.stdout.write(f"Filtering Candidates - Fileset {fileset_count}\r") + sys.stdout.flush() + + +def console_log_matching(fileset_count): + sys.stdout.write(f"Performing Match - Fileset {fileset_count}\r") + sys.stdout.flush() + + +def console_log_detection(fileset_count): + sys.stdout.write(f"Processing - Fileset {fileset_count}\r") + sys.stdout.flush() + + +def console_log_total_filesets(file_path): + count = 0 + with open(file_path, "r") as f: + for line in f: + if line.strip().startswith("game ("): + count += 1 + print(f"Total filesets present - {count}.") From 7dcb20ba4ee3a947c27e28cf02c87d73f2db9731 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 02:45:15 +0530 Subject: [PATCH 18/30] INTEGRITY: Remove custom recursive path split function. 
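
The removed helper walked the path with os.path.split(); the replacement splits on os.sep and drops empty components, which gives the same components for the os.sep-joined relative paths the scan produces (a leading os.sep of an absolute path is dropped by the filter, which the old helper preserved). A small illustration of the replacement expression on made-up paths:

    import os

    def split_components(filepath):
        # Same expression that encode_path_components() uses after this change.
        return [part for part in filepath.split(os.sep) if part]

    samples = [
        os.path.join("scans", "disk1", "GAME.EXE"),
        "plain_file",
        os.sep.join(["dir", "", "doubled", ""]),  # empty components are dropped
    ]
    for sample in samples:
        print(sample, "->", split_components(sample))
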
--- compute_hash.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/compute_hash.py b/compute_hash.py index 626e622..d63b22f 100644 --- a/compute_hash.py +++ b/compute_hash.py @@ -132,25 +132,11 @@ def punycode_need_encode(orig): return True return False - -def split_path_recursive(path): - parts = [] - while True: - head, tail = os.path.split(path) - if tail: - parts.insert(0, tail) - path = head - else: - if head: - parts.insert(0, head) - break - return parts - def encode_path_components(filepath): """ Puny encodes all separate components of filepath """ - parts = split_path_recursive(filepath) + parts = [i for i in filepath.split(os.sep) if i ] encoded_parts = [encode_punycode(p) if punycode_need_encode(p) else p for p in parts] return os.path.join(*encoded_parts) @@ -733,7 +719,7 @@ def create_dat_file(hash_of_dirs, path, checksum_size=0): file.write("game (\n") for filename, (hashes, size, size_r, size_rd, timestamp) in hash_of_dir.items(): filename = encode_path_components(filename) - data = f"name \"{filename}\" size {size} size-r {size_r} size-rd {size_rd} timestamp {timestamp}" + data = f"name '{filename}' size {size} size-r {size_r} size-rd {size_rd} modification-time {timestamp}" for key, value in hashes: data += f" {key} {value}" From 96d9cf473ca63bb83de3b4086efd105112ea46b3 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 02:49:18 +0530 Subject: [PATCH 19/30] INTEGRITY: Use INFORMATION_SCHEMA.COLUMNS instead of relying on error handling for column migration. --- schema.py | 132 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 53 deletions(-) diff --git a/schema.py b/schema.py index 4b9ba42..9bf42ee 100644 --- a/schema.py +++ b/schema.py @@ -164,59 +164,85 @@ def init_database(): "file_fileset_detection": "CREATE INDEX file_fileset_detection ON file (fileset, detection)", } - try: - cursor.execute("ALTER TABLE file ADD COLUMN detection_type VARCHAR(20);") - except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE file MODIFY COLUMN detection_type VARCHAR(20);") - - try: - cursor.execute("ALTER TABLE file ADD COLUMN `timestamp` TIMESTAMP NOT NULL;") - except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE file MODIFY COLUMN `timestamp` TIMESTAMP NOT NULL;") - - try: - cursor.execute("ALTER TABLE fileset ADD COLUMN `user_count` INT;") - except Exception: - # if aleady exists, change the length of the column - cursor.execute("ALTER TABLE fileset MODIFY COLUMN `user_count` INT;") - - try: - cursor.execute("ALTER TABLE file ADD COLUMN punycode_name VARCHAR(200);") - except Exception: - cursor.execute("ALTER TABLE file MODIFY COLUMN punycode_name VARCHAR(200);") - - try: - cursor.execute( - "ALTER TABLE file ADD COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" - ) - except Exception: - cursor.execute( - "ALTER TABLE file MODIFY COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';" - ) - - try: - cursor.execute( - "ALTER TABLE file ADD COLUMN `size-r` BIGINT DEFAULT 0, ADD COLUMN `size-rd` BIGINT DEFAULT 0;" - ) - except Exception: - cursor.execute( - "ALTER TABLE file MODIFY COLUMN `size-r` BIGINT DEFAULT 0, MODIFY COLUMN `size-rd` BIGINT DEFAULT 0;" - ) - try: - cursor.execute("ALTER TABLE log ADD COLUMN `text` varchar(5000);") - except Exception: - cursor.execute("ALTER TABLE log MODIFY COLUMN `text` varchar(5000);") - - try: - cursor.execute( - "ALTER TABLE fileset ADD COLUMN set_dat_metadata 
varchar(5000) DEFAULT 'UTF-8';" - ) - except Exception: - cursor.execute( - "ALTER TABLE fileset MODIFY COLUMN set_dat_metadata varchar(5000) DEFAULT 'UTF-8';" - ) + def migrate_column(cursor, table_name, column_name, add_sql, modify_sql): + query = """ + SELECT COUNT(*) AS count + FROM INFORMATION_SCHEMA.COLUMNS + WHERE table_name = %s AND column_name = %s + """ + cursor.execute(query, (table_name, column_name)) + exists = cursor.fetchone()["count"] > 0 + + if exists: + print(f"Modifying column '{column_name}' in table '{table_name}'") + cursor.execute(modify_sql) + else: + print(f"Adding column '{column_name}' to table '{table_name}'") + cursor.execute(add_sql) + + migrate_column( + cursor, + "file", + "detection_type", + "ALTER TABLE file ADD COLUMN detection_type VARCHAR(20);", + "ALTER TABLE file MODIFY COLUMN detection_type VARCHAR(20);", + ) + + migrate_column( + cursor, + "file", + "timestamp", + "ALTER TABLE file ADD COLUMN `timestamp` TIMESTAMP NOT NULL;", + "ALTER TABLE file MODIFY COLUMN `timestamp` TIMESTAMP NOT NULL;", + ) + + migrate_column( + cursor, + "fileset", + "user_count", + "ALTER TABLE fileset ADD COLUMN `user_count` INT;", + "ALTER TABLE fileset MODIFY COLUMN `user_count` INT;", + ) + + migrate_column( + cursor, + "file", + "punycode_name", + "ALTER TABLE file ADD COLUMN punycode_name VARCHAR(200);", + "ALTER TABLE file MODIFY COLUMN punycode_name VARCHAR(200);", + ) + + migrate_column( + cursor, + "file", + "encoding_type", + "ALTER TABLE file ADD COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';", + "ALTER TABLE file MODIFY COLUMN encoding_type VARCHAR(20) DEFAULT 'UTF-8';", + ) + + migrate_column( + cursor, + "file", + "size-r", + "ALTER TABLE file ADD COLUMN `size-r` BIGINT DEFAULT 0;", + "ALTER TABLE file MODIFY COLUMN `size-r` BIGINT DEFAULT 0;", + ) + + migrate_column( + cursor, + "file", + "size-rd", + "ALTER TABLE file ADD COLUMN `size-rd` BIGINT DEFAULT 0;", + "ALTER TABLE file MODIFY COLUMN `size-rd` BIGINT DEFAULT 0;", + ) + + migrate_column( + cursor, + "log", + "text", + "ALTER TABLE log ADD COLUMN `text` VARCHAR(5000);", + "ALTER TABLE log MODIFY COLUMN `text` VARCHAR(5000);", + ) for index, definition in indices.items(): try: From 4c9a5e7d992a254df2e3bb8999e1df6abb640e70 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 02:57:57 +0530 Subject: [PATCH 20/30] INTEGRITY: Add scan processing logic. --- db_functions.py | 658 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 658 insertions(+) diff --git a/db_functions.py b/db_functions.py index bb6a2a8..683fb63 100644 --- a/db_functions.py +++ b/db_functions.py @@ -902,6 +902,21 @@ def match_fileset(data_arr, username=None, skiplog=False): user, skiplog, ) + elif src == "scan": + scan_process( + game_data, + resources, + detection, + src, + conn, + transaction_id, + filepath, + author, + version, + source_status, + user, + skiplog, + ) else: game_data_lookup = {fs["name"]: fs for fs in game_data} for fileset in game_data: @@ -924,6 +939,628 @@ def match_fileset(data_arr, username=None, skiplog=False): ) +def scan_process( + game_data, + resources, + detection, + src, + conn, + transaction_id, + filepath, + author, + version, + source_status, + user, + skiplog, +): + """ + Entry point for processing logic for scan.dat. + First Pass - Update all files with matching checksum and file size. + Second Pass - Filter candidate with matching with filename, filesize and filechecksum + - Perform matching. 
+ """ + + manual_merged_filesets = 0 + automatic_merged_filesets = 0 + match_with_full_fileset = 0 + mismatch_with_full_fileset = 0 + dropped_early_no_candidate = 0 + manual_merged_with_detection = 0 + filesets_with_missing_files = 0 + + id_to_fileset_mapping = defaultdict(dict) + + for fileset in game_data: + key = calc_key(fileset) + megakey = "" + log_text = f"State {source_status}." + + (fileset_id, existing) = insert_new_fileset( + fileset, + conn, + detection, + src, + key, + megakey, + transaction_id, + log_text, + user, + skiplog=skiplog, + ) + if existing: + continue + + id_to_fileset_mapping[fileset_id] = fileset + + # set of filesets whose files got updated + filesets_check_for_full = set() + + for rom in fileset["rom"]: + scan_update_files(rom, filesets_check_for_full, transaction_id, conn) + + for fileset_id, fileset in id_to_fileset_mapping.items(): + candidate_filesets = scan_filter_candidate_filesets( + fileset_id, fileset, transaction_id, conn + ) + + if len(candidate_filesets) == 0: + category_text = "Drop fileset - No Candidates" + fileset_name = fileset["name"] if "name" in fileset else "" + fileset_description = ( + fileset["description"] if "description" in fileset else "" + ) + log_text = f"Drop fileset as no matching candidates. Name: {fileset_name}, Description: {fileset_description}." + create_log( + escape_string(category_text), user, escape_string(log_text), conn + ) + dropped_early_no_candidate += 1 + delete_original_fileset(fileset_id, conn) + continue + + ( + automatic_merged_filesets, + manual_merged_filesets, + match_with_full_fileset, + mismatch_with_full_fileset, + manual_merged_with_detection, + filesets_with_missing_files, + ) = scan_perform_match( + fileset, + src, + user, + fileset_id, + detection, + candidate_filesets, + automatic_merged_filesets, + manual_merged_filesets, + match_with_full_fileset, + mismatch_with_full_fileset, + manual_merged_with_detection, + filesets_with_missing_files, + conn, + skiplog, + ) + + # Final log + with conn.cursor() as cursor: + cursor.execute( + "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", + (transaction_id,), + ) + fileset_insertion_count = cursor.fetchone()["COUNT(fileset)"] + category_text = f"Uploaded from {src}" + log_text = f"Completed loading DAT file, filename {filepath}, size {os.path.getsize(filepath)}. State {source_status}. Number of filesets: {fileset_insertion_count}. Transaction: {transaction_id}" + create_log(escape_string(category_text), user, escape_string(log_text), conn) + category_text = "Upload information" + log_text = f"Number of filesets: {fileset_insertion_count}. Filesets automatically merged: {automatic_merged_filesets}. Filesets requiring manual merge (multiple candidates): {manual_merged_filesets}. Filesets requiring manual merge (matched with detection): {manual_merged_with_detection}. Filesets dropped, no candidate: {dropped_early_no_candidate}. Filesets matched with existing Full fileset: {match_with_full_fileset}. Filesets with mismatched files with Full fileset: {mismatch_with_full_fileset}. Filesets missing files compared to partial fileset candidate: {filesets_with_missing_files}." + create_log(escape_string(category_text), user, escape_string(log_text), conn) + + +def scan_update_files(rom, filesets_check_for_full, transaction_id, conn): + """ + Updates all the checksums for the files matching by a checksum and size. 
+ """ + with conn.cursor() as cursor: + checksums = defaultdict(str) + for key in rom: + if key not in ["name", "size", "size-r", "size-rd", "modification-time"]: + checksums[key] = rom[key] + + files_to_update = set() + + for _, checksum in checksums.items(): + query = """ + SELECT f.id as file_id, fs.id as fileset_id + FROM file f + JOIN filechecksum fc ON fc.file = f.id + JOIN fileset fs ON fs.id = f.fileset + JOIN transactions t ON t.fileset = fs.id + WHERE fc.checksum = %s + AND f.size = %s + AND f.`size-r` = %s + AND f.`size-rd` = %s + AND t.transaction != %s + """ + size = rom["size"] if "size" in rom else 0 + size_r = rom["size-r"] if "size-r" in rom else 0 + size_rd = rom["size-rd"] if "size-rd" in rom else 0 + cursor.execute(query, (checksum, size, size_r, size_rd, transaction_id)) + result = cursor.fetchall() + if result: + for file in result: + filesets_check_for_full.add(file["fileset_id"]) + files_to_update.add(file["file_id"]) + + for file_id in files_to_update: + query = """ + DELETE FROM filechecksum + WHERE file = %s + """ + cursor.execute(query, (file_id,)) + for check, checksum in checksums.items(): + checksize, checktype, checksum = get_checksum_props(check, checksum) + query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)" + cursor.execute(query, (file_id, checksize, checktype, checksum)) + + conn.commit() + + +def scan_perform_match( + fileset, + src, + user, + fileset_id, + detection, + candidate_filesets, + automatic_merged_filesets, + manual_merged_filesets, + match_with_full_fileset, + mismatch_with_full_fileset, + manual_merged_with_detection, + filesets_with_missing_files, + conn, + skiplog, +): + """ + Performs matching for scan.dat. + If single candidate for match: + detection -> Copy all the files and checksums from scan. + partial -> Copy all the files and checksums from scan. + full -> Drop the scan fileset. But show the differences in file if any. + If more than one candidate for match: + Put them for manual merge. + """ + with conn.cursor() as cursor: + if len(candidate_filesets) == 1: + matched_fileset_id = candidate_filesets[0] + cursor.execute( + "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,) + ) + status = cursor.fetchone()["status"] + # Partial filesets can be turned full directly, as the files have already been updated. + # But the files that had missing size were not updated, so we need to check. + if status == "partial": + # Partial filesets contain all the files, so does the scanned filesets, so this case should not ideally happen. + if total_files(matched_fileset_id, conn) > total_fileset_files(fileset): + category_text = "Missing files" + log_text = f"Missing files in Fileset:{fileset_id}. Try manual merge with Fileset:{matched_fileset_id}." + add_manual_merge( + candidate_filesets, + fileset_id, + category_text, + log_text, + user, + conn, + log_text, + ) + filesets_with_missing_files += 1 + + else: + update_all_files(fileset, matched_fileset_id, False, conn) + update_fileset_status(cursor, matched_fileset_id, "full") + if not skiplog: + log_matched_fileset( + src, + fileset_id, + matched_fileset_id, + "full", + user, + conn, + ) + delete_original_fileset(fileset_id, conn) + automatic_merged_filesets += 1 + + # Detection filests can be turned full if the number of files are equal, + # otherwise we do manual merge to remove extra files. 
+ elif status == "detection": + if total_fileset_files(fileset) == total_files( + matched_fileset_id, conn, detection_only=True + ): + update_all_files(fileset, matched_fileset_id, True, conn) + update_fileset_status(cursor, matched_fileset_id, "full") + if not skiplog: + log_matched_fileset( + src, + fileset_id, + matched_fileset_id, + "full", + user, + conn, + ) + delete_original_fileset(fileset_id, conn) + automatic_merged_filesets += 1 + + else: + category_text = "Manual Merge - Detection found" + log_text = f"Matched with detection. Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}." + add_manual_merge( + candidate_filesets, + fileset_id, + category_text, + log_text, + user, + conn, + log_text, + ) + manual_merged_with_detection += 1 + + # Drop the fileset, note down the file differences + elif status == "full": + (unmatched_candidate_files, unmatched_scan_files) = get_unmatched_files( + matched_fileset_id, fileset, conn + ) + fully_matched = ( + True + if len(unmatched_candidate_files) == 0 + and len(unmatched_scan_files) == 0 + else False + ) + if fully_matched: + match_with_full_fileset += 1 + else: + mismatch_with_full_fileset += 1 + log_scan_match_with_full( + fileset_id, + matched_fileset_id, + unmatched_candidate_files, + unmatched_scan_files, + fully_matched, + user, + conn, + ) + delete_original_fileset(fileset_id, conn) + + elif len(candidate_filesets) > 1: + category_text = "Manual Merge - Multiple Candidates" + log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidate_filesets)}." + manual_merged_filesets += 1 + add_manual_merge( + candidate_filesets, + fileset_id, + category_text, + log_text, + user, + conn, + log_text, + ) + + return ( + automatic_merged_filesets, + manual_merged_filesets, + match_with_full_fileset, + mismatch_with_full_fileset, + manual_merged_with_detection, + filesets_with_missing_files, + ) + + +def update_all_files(fileset, candidate_fileset_id, is_candidate_detection, conn): + """ + Updates all the files, if they were missed out earlier due to missing size. + """ + with conn.cursor() as cursor: + # Extracting the filename from the filepath. 
+ cursor.execute( + f"SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = {candidate_fileset_id}" + ) + target_files = cursor.fetchall() + candidate_files = { + target_file["id"]: target_file["name"].lower() + for target_file in target_files + } + + scan_checksums = set() + scan_names_by_checksum = defaultdict(str) + same_filename_count = defaultdict(int) + + filename_to_filepath_map = defaultdict(str) + filepath_to_checksum_map = defaultdict(dict) + filepath_to_sizes_map = defaultdict(dict) + + for file in fileset["rom"]: + base_name = os.path.basename(normalised_path(file["name"])).lower() + checksums = defaultdict(str) + sizes = defaultdict(int) + for key in file: + if key.startswith("md5"): + scan_checksums.add((file[key], base_name)) + scan_names_by_checksum[(file[key], base_name)] = file["name"] + checksums[key] = file[key] + if key.startswith("size"): + sizes[key] = file[key] + + filepath_to_sizes_map[file["name"]] = sizes + filepath_to_checksum_map[file["name"]] = checksums + same_filename_count[base_name] += 1 + filename_to_filepath_map[base_name] = file["name"] + + checksums = defaultdict(dict) + filepath = "" + + for file_id, file_name in candidate_files.items(): + file_name = file_name.lower() + # Match by filename + if same_filename_count[file_name] == 1: + filepath = filename_to_filepath_map[file_name] + checksums = filepath_to_checksum_map[filepath] + + # If same filename occurs multiple times, fallback to checksum based match + else: + cursor.execute( + "SELECT checksum FROM filechecksum WHERE file = %s", (file_id,) + ) + checksum_rows = cursor.fetchall() + for row in checksum_rows: + checksum = row["checksum"] + if (checksum, file_name) in scan_checksums: + filepath = scan_names_by_checksum[(checksum, file_name)] + checksums = filepath_to_checksum_map[filepath] + + # Delete older checksums + query = """ + DELETE FROM filechecksum + WHERE file = %s + """ + cursor.execute(query, (file_id,)) + # Update the checksums + for key, checksum in checksums.items(): + checksize, checktype, checksum = get_checksum_props(key, checksum) + query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)" + cursor.execute(query, (file_id, checksize, checktype, checksum)) + + # Also updates the sizes, do not update the name if fileset not in detection state + query = """ + UPDATE file + SET size = %s, + `size-r` = %s, + `size-rd` = %s + """ + sizes = filepath_to_sizes_map[filepath] + print(sizes) + if is_candidate_detection: + query += ",name = %s WHERE id = %s" + params = ( + sizes["size"], + sizes["size-r"], + sizes["size-rd"], + normalised_path(filepath), + file_id, + ) + else: + query += "WHERE id = %s" + params = (sizes["size"], sizes["size-r"], sizes["size-rd"], file_id) + cursor.execute(query, params) + + +def total_files(fileset_id, conn, detection_only=False): + """ + Returns the total number of files (only detection files if detection_only set to true) present in the given fileset from the database. 
+ """ + with conn.cursor() as cursor: + query = """ + SELECT COUNT(*) AS count + FROM file f + JOIN fileset fs ON fs.id = f.fileset + """ + if detection_only: + query += """ + WHERE f.detection = 1 + AND fs.id = %s + """ + else: + query += "WHERE fs.id = %s" + cursor.execute(query, (fileset_id,)) + return cursor.fetchone()["count"] + + +def total_fileset_files(fileset): + """ + Returns the number of files present in the fileset + """ + return len(fileset["rom"]) + + +def scan_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): + """ + Returns a list of candidate filesets that can be merged + """ + with conn.cursor() as cursor: + # Returns those filesets which have all detection files matching in the scan fileset filtered by file name and file size(if not -1). + + query = """ + WITH candidate_fileset AS ( + SELECT fs.id AS fileset_id, f.name, f.size, + f.`size-r` AS size_r, f.`size-rd` AS size_rd + FROM file f + JOIN fileset fs ON f.fileset = fs.id + JOIN game g ON g.id = fs.game + JOIN transactions t ON t.fileset = fs.id + WHERE f.detection = 1 + AND t.transaction != %s + ), + total_detection_files AS ( + SELECT cf.fileset_id, COUNT(*) AS detection_files_found + FROM candidate_fileset cf + GROUP BY fileset_id + ), + set_fileset AS ( + SELECT name, size, + `size-r` AS size_r, `size-rd` AS size_rd + FROM file + WHERE fileset = %s + ), + matched_detection_files AS ( + SELECT cf.fileset_id, COUNT(*) AS match_files_count + FROM candidate_fileset cf + JOIN set_fileset sf ON ( ( + cf.name = sf.name + OR + REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '') + ) AND (cf.size = sf.size OR cf.size = -1) + AND (cf.size_r = sf.size_r) + AND (cf.size_rd = sf.size_rd)) + GROUP BY cf.fileset_id + ), + valid_matched_detection_files AS ( + SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count + FROM matched_detection_files mdf + JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id + WHERE tdf.detection_files_found <= mdf.match_files_count + ), + max_match_count AS ( + SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files + ) + SELECT vmdf.fileset_id + FROM valid_matched_detection_files vmdf + JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id + JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count + """ + + cursor.execute(query, (transaction_id, fileset_id)) + rows = cursor.fetchall() + + candidates = [] + if rows: + for row in rows: + candidates.append(row["fileset_id"]) + + for candidate in candidates: + if not is_full_detection_checksum_match(candidate, fileset, conn): + candidates.remove(candidate) + + return candidates + + +def get_unmatched_files(candidate_fileset, fileset, conn): + """ + Checks if all checksums from candidate_fileset match scan file checksums. 
+ Returns: + unmatched_candidate_files: candidate files whose checksums weren't found in scan + unmatched_scan_files: scan files whose checksums weren't matched by candidate + """ + with conn.cursor() as cursor: + cursor.execute( + "SELECT id, name FROM file WHERE fileset = %s", (candidate_fileset,) + ) + candidate_file_rows = cursor.fetchall() + candidate_files = {row["id"]: row["name"] for row in candidate_file_rows} + + scan_checksums = set() + scan_names_by_checksum = {} + + for file in fileset["rom"]: + base_name = os.path.basename(normalised_path(file["name"])).lower() + for key in file: + if key.startswith("md5"): + scan_checksums.add((file[key], base_name)) + scan_names_by_checksum[(file[key], base_name)] = file["name"] + + unmatched_candidate_files = [] + matched_scan_pairs = set() + + for file_id, file_name in candidate_files.items(): + cursor.execute( + "SELECT checksum FROM filechecksum WHERE file = %s", (file_id,) + ) + checksum_rows = cursor.fetchall() + + base_name = os.path.basename(file_name).lower() + match_found = False + + for row in checksum_rows: + checksum = row["checksum"] + if (checksum, base_name) in scan_checksums: + matched_scan_pairs.add((checksum, base_name)) + match_found = True + + if not match_found: + unmatched_candidate_files.append(file_name) + + unmatched_scan_files = { + scan_names_by_checksum[key] + for key in scan_checksums + if key not in matched_scan_pairs + } + unmatched_scan_files = list(unmatched_scan_files) + + return (unmatched_candidate_files, unmatched_scan_files) + + +def is_full_detection_checksum_match(candidate_fileset, fileset, conn): + """ + Return type - Boolean + Checks if all the detection files in the candidate fileset have corresponding checksums matching with scan. + + scan - rom ( name "AFM Read Me!_2" size 8576 size-r 1 size-rd 0 modification-time 1993-05-12 md5 dsd16ccea050db521a678a1cdc33794c md5-5000 008e76ec3ae58d0add637ea7aa299a2a md5-t-5000 118e76ec3ae58d0add637ea7aa299a2c md5-1048576 37d16ccea050db521a678a1cdc33794c) + """ + with conn.cursor() as cursor: + cursor.execute( + "SELECT id, name FROM file WHERE detection=1 AND fileset = %s", + (candidate_fileset,), + ) + target_files = cursor.fetchall() + candidate_files = { + target_file["id"]: target_file["name"] for target_file in target_files + } + + # set of (checksum, filename) + scan_checksums = set() + for file in fileset["rom"]: + for key in file: + if key.startswith("md5"): + name = os.path.basename(normalised_path(file["name"])) + scan_checksums.add((file[key], name.lower())) + + for detection_file_id, detection_file_name in candidate_files.items(): + query = """ + SELECT fc.checksum, fc.checksize, fc.checktype + FROM filechecksum fc + WHERE fc.file = %s + """ + cursor.execute(query, (detection_file_id,)) + checksums_info = cursor.fetchall() + match_found = False + if checksums_info: + for checksum_info in checksums_info: + checksum = checksum_info["checksum"] + if ( + checksum, + os.path.basename(detection_file_name.lower()), + ) not in scan_checksums: + match_found = True + break + + if match_found: + return False + + return True + + +# ------------------------------------------------------------------------------------------------------- +# Set.dat processing below +# ------------------------------------------------------------------------------------------------------- + + def set_process( game_data, resources, @@ -2085,6 +2722,27 @@ def log_matched_fileset(src, fileset_last, fileset_id, state, user, conn): update_history(fileset_last, fileset_id, conn, 
log_last) +def log_scan_match_with_full( + fileset_last, + candidate_id, + unmatched_candidate_files, + unmatched_scan_files, + fully_matched, + user, + conn, +): + category_text = "Mismatch with Full set" + if fully_matched: + category_text = "Existing as Full set." + log_text = f"""Files mismatched with Full Fileset:{candidate_id}. Unmatched Files in scan fileset = {len(unmatched_scan_files)}. Unmatched Files in full fileset = {len(unmatched_candidate_files)}. List of unmatched files scan.dat : {", ".join(scan_file for scan_file in unmatched_scan_files)}, List of unmatched files full fileset : {", ".join(scan_file for scan_file in unmatched_candidate_files)}""" + if fully_matched: + log_text = ( + f"Fileset matched completely with Full Fileset:{candidate_id}. Dropping." + ) + print(log_text) + create_log(escape_string(category_text), user, escape_string(log_text), conn) + + def finalize_fileset_insertion( conn, transaction_id, src, filepath, author, version, source_status, user ): From 33cac5a068e6c0360c37c34a46afaa7ac2da9447 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 03:02:14 +0530 Subject: [PATCH 21/30] INTEGRITY: Add additional modification-time column in file table. --- dat_parser.py | 3 +++ db_functions.py | 7 +++++-- fileset.py | 9 +++------ schema.py | 16 ++++++++++++++++ 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/dat_parser.py b/dat_parser.py index b3ce12e..a76480b 100644 --- a/dat_parser.py +++ b/dat_parser.py @@ -33,6 +33,9 @@ def map_checksum_data(content_string): elif tokens[i] == "size-rd": current_rom["size-rd"] = int(tokens[i + 1]) i += 2 + elif tokens[i] == "modification-time": + current_rom["modification-time"] = tokens[i + 1] + i += 2 else: checksum_key = tokens[i] checksum_value = tokens[i + 1] if len(tokens) >= 6 else "0" diff --git a/db_functions.py b/db_functions.py index 683fb63..408ea29 100644 --- a/db_functions.py +++ b/db_functions.py @@ -234,13 +234,16 @@ def insert_file(file, detection, src, conn): values.append(file["size-r"] if "size-r" in file else "0") values.append(file["size-rd"] if "size-rd" in file else "0") + modification_time = file["modification-time"] if "modification-time" in file else "" + values.append(modification_time) + values.extend([checksum, detection, detection_type]) # Parameterised Query placeholders = ( - ["%s"] * (len(values[:5])) + ["@fileset_last"] + ["%s"] * 2 + ["NOW()"] + ["%s"] * (len(values[:6])) + ["@fileset_last"] + ["%s"] * 2 + ["NOW()"] ) - query = f"INSERT INTO file ( name, size, `size-r`, `size-rd`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ({', '.join(placeholders)})" + query = f"INSERT INTO file ( name, size, `size-r`, `size-rd`, `modification-time`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ({', '.join(placeholders)})" with conn.cursor() as cursor: cursor.execute(query, values) diff --git a/fileset.py b/fileset.py index c8e9cf7..a930f54 100644 --- a/fileset.py +++ b/fileset.py @@ -189,7 +189,6 @@ def fileset(): query = """SELECT game.name as 'game name', engineid, gameid, extra, platform, language, fileset.set_dat_metadata FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = %s""" else: query = """SELECT game.name as 'game name', engineid, gameid, extra, platform, language FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = %s""" - print(query) cursor.execute(query, (id,)) result = {**result, **cursor.fetchone()} else: 
@@ -240,6 +239,7 @@ def fileset(): "detection", "detection_type", "timestamp", + "modification-time", ] if sort: @@ -250,13 +250,10 @@ if "desc" in sort: order += " DESC" - columns_to_select = "file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp`" + columns_to_select = "file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp`, `modification-time`" columns_to_select += ", ".join(md5_columns) - print( - f"SELECT file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp` FROM file WHERE fileset = {id} {order}" - ) cursor.execute( - f"SELECT file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp` FROM file WHERE fileset = {id} {order}" + f"SELECT file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp`, `modification-time` FROM file WHERE fileset = {id} {order}" ) result = cursor.fetchall() diff --git a/schema.py b/schema.py index 9bf42ee..776eada 100644 --- a/schema.py +++ b/schema.py @@ -244,6 +244,22 @@ def migrate_column(cursor, table_name, column_name, add_sql, modify_sql): "ALTER TABLE log MODIFY COLUMN `text` VARCHAR(5000);", ) + migrate_column( + cursor, + "fileset", + "set_dat_metadata", + "ALTER TABLE fileset ADD COLUMN set_dat_metadata VARCHAR(5000) DEFAULT '';", + "ALTER TABLE fileset MODIFY COLUMN set_dat_metadata VARCHAR(5000) DEFAULT '';", + ) + + migrate_column( + cursor, + "file", + "modification-time", + "ALTER TABLE file ADD COLUMN `modification-time` VARCHAR(100) DEFAULT '';", + "ALTER TABLE file MODIFY COLUMN `modification-time` VARCHAR(100) DEFAULT '';", + ) + for index, definition in indices.items(): try: cursor.execute(definition) From 90ffe1a37ec8d1de4311c4cefd6cb248a4e78ac6 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 03:04:24 +0530 Subject: [PATCH 22/30] INTEGRITY: Additional error handling while extracting keys from scummvm.dat. --- db_functions.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/db_functions.py b/db_functions.py index 408ea29..f9db3bc 100644 --- a/db_functions.py +++ b/db_functions.py @@ -546,13 +546,19 @@ def db_insert(data_arr, username=None, skiplog=False): megakey = calc_megakey(fileset) if detection: - engine_name = fileset["engine"] - engineid = fileset["sourcefile"] - gameid = fileset["name"] - title = fileset["title"] - extra = fileset["extra"] - platform = fileset["platform"] - lang = fileset["language"] + try: + engine_name = fileset.get("engine", "") + engineid = fileset["sourcefile"] + gameid = fileset["name"] + title = fileset.get("title", "") + extra = fileset.get("extra", "") + platform = fileset.get("platform", "") + lang = fileset.get("language", "") + except KeyError as e: + print( + f"Missing key in header: {e} for {fileset.get('name', '')}-{fileset.get('language', '')}-{fileset.get('platform', '')}" + ) + return with conn.cursor() as cursor: query = """ From 493acb524bc3fcff22c05f6985796d8ff37757ac Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Tue, 15 Jul 2025 04:29:37 +0530 Subject: [PATCH 23/30] INTEGRITY: Traverse set.dat instead of candidate fileset while searching mismatched files.
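The idea, roughly: build the checksum lookup from the set.dat side and walk the candidate's files against it, so the unmatched lists for both sides fall out of a single pass. A simplified, self-contained sketch of that approach (plain dicts and lists with illustrative names, not the actual database rows the diff below works with):

    import os

    def unmatched_on_both_sides(dat_files, candidate_files):
        # dat_files: [{"name": "DATA/GAME.EXE", "md5": "..."}, ...]
        # candidate_files: [{"name": "game.exe", "checksums": ["...", ...]}, ...]
        dat_pairs = {}
        for f in dat_files:
            base = os.path.basename(f["name"].replace("\\", "/")).lower()
            for key, value in f.items():
                if key.startswith("md5"):
                    dat_pairs[(value, base)] = f["name"]

        matched = set()
        unmatched_candidate = []
        for cf in candidate_files:
            base = os.path.basename(cf["name"].replace("\\", "/")).lower()
            hits = [(c, base) for c in cf["checksums"] if (c, base) in dat_pairs]
            if hits:
                matched.update(hits)
            else:
                unmatched_candidate.append(cf["name"])

        unmatched_dat = [name for pair, name in dat_pairs.items() if pair not in matched]
        return unmatched_candidate, unmatched_dat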
--- db_functions.py | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/db_functions.py b/db_functions.py index f9db3bc..6906a64 100644 --- a/db_functions.py +++ b/db_functions.py @@ -281,8 +281,6 @@ def add_all_equal_checksums(checksize, checktype, checksum, file_id, conn): size_name = "size" if checktype[-1] == "r": size_name += "-rd" - if checktype[-1] == "s": - size_name += "-d" cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = {file_id}") result = cursor.fetchone() @@ -1345,7 +1343,6 @@ def update_all_files(fileset, candidate_fileset_id, is_candidate_detection, conn `size-rd` = %s """ sizes = filepath_to_sizes_map[filepath] - print(sizes) if is_candidate_detection: query += ",name = %s WHERE id = %s" params = ( @@ -1462,10 +1459,10 @@ def scan_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): def get_unmatched_files(candidate_fileset, fileset, conn): """ - Checks if all checksums from candidate_fileset match scan file checksums. + Checks if all checksums from candidate_fileset match dat file checksums. Returns: unmatched_candidate_files: candidate files whose checksums weren't found in scan - unmatched_scan_files: scan files whose checksums weren't matched by candidate + unmatched_dat_files: dat files whose checksums weren't matched by candidate """ with conn.cursor() as cursor: cursor.execute( @@ -1474,18 +1471,18 @@ def get_unmatched_files(candidate_fileset, fileset, conn): candidate_file_rows = cursor.fetchall() candidate_files = {row["id"]: row["name"] for row in candidate_file_rows} - scan_checksums = set() - scan_names_by_checksum = {} + dat_checksums = set() + dat_names_by_checksum = {} for file in fileset["rom"]: base_name = os.path.basename(normalised_path(file["name"])).lower() for key in file: if key.startswith("md5"): - scan_checksums.add((file[key], base_name)) - scan_names_by_checksum[(file[key], base_name)] = file["name"] + dat_checksums.add((file[key], base_name)) + dat_names_by_checksum[(file[key], base_name)] = file["name"] unmatched_candidate_files = [] - matched_scan_pairs = set() + matched_dat_pairs = set() for file_id, file_name in candidate_files.items(): cursor.execute( @@ -1498,21 +1495,21 @@ def get_unmatched_files(candidate_fileset, fileset, conn): for row in checksum_rows: checksum = row["checksum"] - if (checksum, base_name) in scan_checksums: - matched_scan_pairs.add((checksum, base_name)) + if (checksum, base_name) in dat_checksums: + matched_dat_pairs.add((checksum, base_name)) match_found = True if not match_found: unmatched_candidate_files.append(file_name) - unmatched_scan_files = { - scan_names_by_checksum[key] - for key in scan_checksums - if key not in matched_scan_pairs + unmatched_dat_files = { + dat_names_by_checksum[key] + for key in dat_checksums + if key not in matched_dat_pairs } - unmatched_scan_files = list(unmatched_scan_files) + unmatched_dat_files = list(unmatched_dat_files) - return (unmatched_candidate_files, unmatched_scan_files) + return (unmatched_candidate_files, unmatched_dat_files) def is_full_detection_checksum_match(candidate_fileset, fileset, conn): @@ -1524,7 +1521,7 @@ def is_full_detection_checksum_match(candidate_fileset, fileset, conn): """ with conn.cursor() as cursor: cursor.execute( - "SELECT id, name FROM file WHERE detection=1 AND fileset = %s", + "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name FROM file WHERE detection=1 AND fileset = %s", (candidate_fileset,), ) target_files = cursor.fetchall() @@ -1682,7 
+1679,7 @@ def set_process( console_message = "Candidate filtering finished." console_log(console_message) console_message = ( - f"{dropped_early_no_candidate} Filesets Dropped - No candidates found." + f"{dropped_early_no_candidate} Filesets Dropped for No candidates." ) console_log(console_message) console_message = "Looking for duplicates..." @@ -1872,9 +1869,15 @@ def set_perform_match( matched_fileset_id, manual_merge_map, set_to_candidate_dict, conn ) elif status == "partial" or status == "full": - (is_match, unmatched_files) = is_full_checksum_match( + (unmatched_candidate_files, unmatched_dat_files) = get_unmatched_files( matched_fileset_id, fileset, conn ) + is_match = ( + True + if len(unmatched_candidate_files) == 0 + and len(unmatched_dat_files) == 0 + else False + ) if is_match: category_text = "Already present" log_text = f"Already present as - Fileset:{matched_fileset_id}. Deleting Fileset:{fileset_id}" @@ -1890,7 +1893,8 @@ def set_perform_match( else: category_text = "Mismatch" - log_text = f"Fileset:{fileset_id} mismatched with Fileset:{matched_fileset_id} with status:{status}. Try manual merge." + log_text = f"Fileset:{fileset_id} mismatched with Fileset:{matched_fileset_id} with status:{status}. Try manual merge. Unmatched Files in set.dat fileset = {len(unmatched_dat_files)} Unmatched Files in candidate fileset = {len(unmatched_candidate_files)}. List of unmatched files scan.dat : {', '.join(scan_file for scan_file in unmatched_dat_files)}, List of unmatched files full fileset : {', '.join(scan_file for scan_file in unmatched_candidate_files)}" + console_log(log_text) # print_text = f"Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}. Unmatched files: {len(unmatched_files)}." mismatch_filesets += 1 add_manual_merge( @@ -1904,7 +1908,6 @@ def set_perform_match( elif len(candidate_filesets) > 1: found_match = False - for candidate_fileset in candidate_filesets: (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn) if is_match: From ff9934f5cfb55ce58125c4d332e185e00183b2b1 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Wed, 16 Jul 2025 18:46:46 +0530 Subject: [PATCH 24/30] INTEGRITY: Add checksum based filtering in set.dat, when possible. --- db_functions.py | 134 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 97 insertions(+), 37 deletions(-) diff --git a/db_functions.py b/db_functions.py index 6906a64..3a4fc12 100644 --- a/db_functions.py +++ b/db_functions.py @@ -977,7 +977,9 @@ def scan_process( id_to_fileset_mapping = defaultdict(dict) + fileset_count = 0 for fileset in game_data: + console_log_file_update(fileset_count) key = calc_key(fileset) megakey = "" log_text = f"State {source_status}." 
@@ -1003,9 +1005,12 @@ def scan_process( filesets_check_for_full = set() for rom in fileset["rom"]: - scan_update_files(rom, filesets_check_for_full, transaction_id, conn) + pre_update_files(rom, filesets_check_for_full, transaction_id, conn) + fileset_count += 1 + fileset_count = 0 for fileset_id, fileset in id_to_fileset_mapping.items(): + console_log_matching(fileset_count) candidate_filesets = scan_filter_candidate_filesets( fileset_id, fileset, transaction_id, conn ) @@ -1047,6 +1052,7 @@ def scan_process( conn, skiplog, ) + fileset_count += 1 # Final log with conn.cursor() as cursor: @@ -1063,7 +1069,7 @@ def scan_process( create_log(escape_string(category_text), user, escape_string(log_text), conn) -def scan_update_files(rom, filesets_check_for_full, transaction_id, conn): +def pre_update_files(rom, filesets_check_for_full, transaction_id, conn): """ Updates all the checksums for the files matching by a checksum and size. """ @@ -1074,6 +1080,9 @@ def scan_update_files(rom, filesets_check_for_full, transaction_id, conn): checksums[key] = rom[key] files_to_update = set() + size = rom["size"] if "size" in rom else 0 + size_r = rom["size-r"] if "size-r" in rom else 0 + size_rd = rom["size-rd"] if "size-rd" in rom else 0 for _, checksum in checksums.items(): query = """ @@ -1088,9 +1097,7 @@ def scan_update_files(rom, filesets_check_for_full, transaction_id, conn): AND f.`size-rd` = %s AND t.transaction != %s """ - size = rom["size"] if "size" in rom else 0 - size_r = rom["size-r"] if "size-r" in rom else 0 - size_rd = rom["size-rd"] if "size-rd" in rom else 0 + cursor.execute(query, (checksum, size, size_r, size_rd, transaction_id)) result = cursor.fetchall() if result: @@ -1104,12 +1111,20 @@ def scan_update_files(rom, filesets_check_for_full, transaction_id, conn): WHERE file = %s """ cursor.execute(query, (file_id,)) + # Update checksums for check, checksum in checksums.items(): checksize, checktype, checksum = get_checksum_props(check, checksum) query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)" cursor.execute(query, (file_id, checksize, checktype, checksum)) - - conn.commit() + # Update sizes + query = """ + UPDATE file + SET size = %s, + `size-r` = %s, + `size-rd` = %s, + WHERE id = %s + """ + cursor.execute(query, size, size_r, size_rd, file_id) def scan_perform_match( @@ -1907,31 +1922,7 @@ def set_perform_match( ) elif len(candidate_filesets) > 1: - found_match = False - for candidate_fileset in candidate_filesets: - (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn) - if is_match: - update_fileset_status(cursor, candidate_fileset, "partial") - set_populate_file(fileset, candidate_fileset, conn, detection) - auto_merged_filesets += 1 - if not skiplog: - log_matched_fileset( - src, - fileset_id, - candidate_fileset, - "partial", - user, - conn, - ) - delete_original_fileset(fileset_id, conn) - remove_manual_merge_if_size_mismatch( - candidate_fileset, manual_merge_map, set_to_candidate_dict, conn - ) - found_match = True - break - - if not found_match: - manual_merge_map[fileset_id] = candidate_filesets + manual_merge_map[fileset_id] = candidate_filesets return ( fully_matched_filesets, @@ -2160,8 +2151,7 @@ def set_filter_candidate_filesets( JOIN game g ON g.id = fs.game JOIN engine e ON e.id = g.engine JOIN transactions t ON t.fileset = fs.id - WHERE fs.id != %s - AND e.engineid = %s + WHERE e.engineid = %s AND f.detection = 1 AND t.transaction != %s ), @@ -2199,9 +2189,7 @@ def 
set_filter_candidate_filesets( JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count """ - cursor.execute( - query, (fileset_id, fileset["sourcefile"], transaction_id, fileset_id) - ) + cursor.execute(query, (fileset["sourcefile"], transaction_id, fileset_id)) rows = cursor.fetchall() candidates = [] @@ -2209,9 +2197,76 @@ def set_filter_candidate_filesets( for row in rows: candidates.append(row["fileset_id"]) + matched_candidates = [] + + candidates = [ + candidate + for candidate in candidates + if is_candidate_by_checksize(candidate, fileset, conn) + ] + + for candidate in candidates: + if is_full_detection_checksum_match(candidate, fileset, conn): + matched_candidates.append(candidate) + + if len(matched_candidates) != 0: + candidates = matched_candidates + return (candidates, fileset_count) +def is_candidate_by_checksize(candidate, fileset, conn): + with conn.cursor() as cursor: + cursor.execute( + "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE detection=1 AND fileset = %s", + (candidate,), + ) + target_files = cursor.fetchall() + candidate_files = { + target_file["id"]: [target_file["name"], target_file["size"]] + for target_file in target_files + } + + # set of (checksum, filename) + scan_checksums = set() + for file in fileset["rom"]: + for key in file: + if key.startswith("md5"): + name = os.path.basename(normalised_path(file["name"])) + scan_checksums.add((file[key], name.lower())) + + for detection_file_id, [ + detection_file_name, + detection_file_size, + ] in candidate_files.items(): + query = """ + SELECT fc.checksum, fc.checksize, fc.checktype + FROM filechecksum fc + WHERE fc.file = %s + """ + cursor.execute(query, (detection_file_id,)) + checksums_info = cursor.fetchall() + if checksums_info: + for checksum_info in checksums_info: + checksum = checksum_info["checksum"] + checksize = checksum_info["checksize"] + if checksize == "1M": + checksize = 1048576 + if ( + ( + checksum, + os.path.basename(detection_file_name.lower()), + ) + not in scan_checksums + and detection_file_size <= int(checksize) + and detection_file_size != -1 + ): + continue + else: + return True + return False + + def process_fileset( fileset, resources, @@ -2972,6 +3027,11 @@ def console_log_candidate_filtering(fileset_count): sys.stdout.flush() +def console_log_file_update(fileset_count): + sys.stdout.write(f"Updating files - Fileset {fileset_count}\r") + sys.stdout.flush() + + def console_log_matching(fileset_count): sys.stdout.write(f"Performing Match - Fileset {fileset_count}\r") sys.stdout.flush() From ca9d4a7a2a9330373e585d2948ed63405128964d Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 17 Jul 2025 17:29:48 +0530 Subject: [PATCH 25/30] INTEGRITY: Remove 'obsolete' fileset status entirely. 
--- db_functions.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/db_functions.py b/db_functions.py index 3a4fc12..c4e65a2 100644 --- a/db_functions.py +++ b/db_functions.py @@ -146,9 +146,6 @@ def insert_fileset( cursor.execute( f"UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = {existing_entry}" ) - cursor.execute( - f"UPDATE fileset SET status = 'detection' WHERE id = {existing_entry} AND status = 'obsolete'" - ) cursor.execute(f"SELECT status FROM fileset WHERE id = {existing_entry}") status = cursor.fetchone()["status"] if status == "user": @@ -610,10 +607,6 @@ def db_insert(data_arr, username=None, skiplog=False): fileset_count += 1 - if detection: - conn.cursor().execute( - "UPDATE fileset SET status = 'obsolete' WHERE `timestamp` != FROM_UNIXTIME(@fileset_time_last) AND status = 'detection'" - ) cur = conn.cursor() try: From a30364586988c4df01427fa16b2942b57be02981 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 17 Jul 2025 17:31:57 +0530 Subject: [PATCH 26/30] INTEGRITY: Add checksum based filtering before filtering by maximum number of files matched. --- db_functions.py | 155 +++++++++++++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 62 deletions(-) diff --git a/db_functions.py b/db_functions.py index c4e65a2..20a3240 100644 --- a/db_functions.py +++ b/db_functions.py @@ -2130,15 +2130,17 @@ def set_filter_candidate_filesets( fileset_id, fileset, fileset_count, transaction_id, conn ): """ - Returns a list of candidate filesets that can be merged + Returns a list of candidate filesets that can be merged. + Performs early filtering in SQL (by engine, name, size) and then + applies checksum filtering and max-match filtering in Python. """ with conn.cursor() as cursor: - # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches fileset_count += 1 console_log_candidate_filtering(fileset_count) + + # Early filter candidates using enginename, filename and size query = """ - WITH candidate_fileset AS ( - SELECT fs.id AS fileset_id, f.name, f.size + SELECT fs.id AS fileset_id, f.id AS file_id, f.name, f.size FROM file f JOIN fileset fs ON f.fileset = fs.id JOIN game g ON g.id = fs.game @@ -2147,65 +2149,94 @@ def set_filter_candidate_filesets( WHERE e.engineid = %s AND f.detection = 1 AND t.transaction != %s - ), - total_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS detection_files_found - FROM candidate_fileset cf - GROUP BY fileset_id - ), - set_fileset AS ( - SELECT name, size FROM file - WHERE fileset = %s - ), - matched_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS match_files_count - FROM candidate_fileset cf - JOIN set_fileset sf ON ( ( - cf.name = sf.name - OR - REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '') - ) AND (cf.size = sf.size OR cf.size = -1) ) - GROUP BY cf.fileset_id - ), - valid_matched_detection_files AS ( - SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count - FROM matched_detection_files mdf - JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id - WHERE tdf.detection_files_found <= mdf.match_files_count - ), - max_match_count AS ( - SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files - ) - SELECT vmdf.fileset_id - FROM valid_matched_detection_files vmdf - JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id - JOIN max_match_count mmc ON 
vmdf.valid_match_files_count = mmc.max_count """ - - cursor.execute(query, (fileset["sourcefile"], transaction_id, fileset_id)) - rows = cursor.fetchall() - - candidates = [] - if rows: - for row in rows: - candidates.append(row["fileset_id"]) - - matched_candidates = [] - - candidates = [ - candidate - for candidate in candidates - if is_candidate_by_checksize(candidate, fileset, conn) - ] - - for candidate in candidates: - if is_full_detection_checksum_match(candidate, fileset, conn): - matched_candidates.append(candidate) - - if len(matched_candidates) != 0: - candidates = matched_candidates - - return (candidates, fileset_count) + cursor.execute(query, (fileset["sourcefile"], transaction_id)) + raw_candidates = cursor.fetchall() + + # fileset id to detection files map + candidate_map = defaultdict(list) + total_detection_files_map = defaultdict(int) + for row in raw_candidates: + candidate_map[row["fileset_id"]].append( + { + "file_id": row["file_id"], + "name": row["name"], + "size": row["size"], + } + ) + for id, files in candidate_map.items(): + total_detection_files_map[id] = len(files) + + set_checksums = set() + set_file_name_size = set() + for file in fileset["rom"]: + for key in file: + if key.startswith("md5"): + name = os.path.basename(normalised_path(file["name"])) + set_checksums.add((file[key], name.lower(), int(file["size"]))) + set_checksums.add((file[key], name.lower(), -1)) + set_file_name_size.add((name.lower(), -1)) + set_file_name_size.add((name.lower(), int(file["size"]))) + + # Filter candidates by detection filename and file size (including -1) and increase matched file count + # if filesize = -1, + # elif filesize <= checksize and checksum matches, + # elif filesize > checksize. + match_counts = {} + for fileset_id, files in candidate_map.items(): + count = 0 + with conn.cursor() as cursor: + for f in files: + filename = os.path.basename(f["name"]).lower() + filesize = f["size"] + if (filename, filesize) in set_file_name_size: + if filesize == -1: + count += 1 + else: + cursor.execute( + """ + SELECT checksum, checksize, checktype + FROM filechecksum + WHERE file = %s + """, + (f["file_id"],), + ) + checksums = cursor.fetchall() + not_inc_count = False + for c in checksums: + checksum = c["checksum"] + checksize = c["checksize"] + if checksize == "1M": + checksize = 1048576 + elif checksize == "0": + checksize = filesize + if filesize <= int(checksize): + if (checksum, filename, filesize) in set_checksums: + count += 1 + not_inc_count = True + # if it was a true match, checksum should be present + break + if not not_inc_count: + count += 1 + if count > 0 and total_detection_files_map[fileset_id] <= count: + match_counts[fileset_id] = count + + # Filter only entries with maximum number of matched files + if not match_counts: + return ([], fileset_count) + + max_match = max(match_counts.values()) + candidates = [fid for fid, count in match_counts.items() if count == max_match] + + matched_candidates = [] + for candidate in candidates: + if is_full_detection_checksum_match(candidate, fileset, conn): + matched_candidates.append(candidate) + + if len(matched_candidates) != 0: + candidates = matched_candidates + + return (candidates, fileset_count) def is_candidate_by_checksize(candidate, fileset, conn): From 4dd7e297c842b9b399ca6b44bbdc56738877f003 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 17 Jul 2025 19:47:06 +0530 Subject: [PATCH 27/30] INTEGRITY: Merge one of the entries from dropped duplicate entries. Drop others. 
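In short, when several set.dat entries all map to the same single candidate, only the first one is merged and the remaining ones are logged and dropped. A minimal sketch of that grouping, with illustrative names (the real loop below also writes log entries and deletes the dropped filesets):

    from collections import defaultdict

    def split_duplicate_filesets(set_to_candidate):
        # set_to_candidate: {set_fileset_id: [candidate_fileset_ids]}
        by_candidate = defaultdict(list)
        for set_id, candidates in set_to_candidate.items():
            if len(candidates) == 1:
                by_candidate[candidates[0]].append(set_id)

        keep, drop = {}, []
        for candidate, set_ids in by_candidate.items():
            if len(set_ids) > 1:
                keep[set_ids[0]] = candidate   # first entry still gets merged
                drop.extend(set_ids[1:])       # the rest are extra variants, dropped
        return keep, drop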
--- db_functions.py | 50 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/db_functions.py b/db_functions.py index 20a3240..f9e9fe0 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1693,7 +1693,7 @@ def set_process( console_message = "Looking for duplicates..." console_log(console_message) - # Remove all such filesets, which have many to one mapping with a single candidate, those are extra variants. + # Remove all such filesets, which have many to one mapping with a single candidate, just merge one of them. value_to_keys = defaultdict(list) for set_fileset, candidates in set_to_candidate_dict.items(): if len(candidates) == 1: @@ -1717,7 +1717,12 @@ def set_process( platform = result["platform"] language = result["language"] + # Skip the first entry, let it merge and drop others + skip = True for set_fileset in set_filesets: + if skip: + skip = False + continue fileset = id_to_fileset_dict[set_fileset] category_text = "Drop fileset - Duplicates" fileset_name = fileset["name"] if "name" in fileset else "" @@ -1742,9 +1747,9 @@ def set_process( fileset = id_to_fileset_dict[fileset_id] # Filter by platform to reduce manual merge - candidate_filesets = set_filter_by_platform( - fileset["name"], candidate_filesets, conn - ) + # candidate_filesets = set_filter_by_platform( + # fileset["name"], candidate_filesets, conn + # ) ( fully_matched_filesets, @@ -1771,16 +1776,35 @@ def set_process( match_count += 1 console_log("Matching performed.") - for fileset_id, candidates in manual_merge_map.items(): - category_text = "Manual Merge Required" - log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidates)}." - manual_merged_filesets += 1 - add_manual_merge( - candidates, fileset_id, category_text, log_text, user, conn, log_text - ) - - # Final log with conn.cursor() as cursor: + for fileset_id, candidates in manual_merge_map.items(): + if len(candidates) == 0: + category_text = "Drop fileset - No Candidates" + fileset = id_to_fileset_dict[fileset_id] + fileset_name = fileset["name"] if "name" in fileset else "" + fileset_description = ( + fileset["description"] if "description" in fileset else "" + ) + log_text = f"Drop fileset as no matching candidates. Name: {fileset_name}, Description: {fileset_description}." + create_log( + escape_string(category_text), user, escape_string(log_text), conn + ) + dropped_early_no_candidate += 1 + delete_original_fileset(fileset_id, conn) + else: + category_text = "Manual Merge Required" + log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidates)}." + manual_merged_filesets += 1 + add_manual_merge( + candidates, + fileset_id, + category_text, + log_text, + user, + conn, + log_text, + ) + cursor.execute( "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", (transaction_id,), From 57df340d9f267dc2e88649d4c377fede6f589e87 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Thu, 17 Jul 2025 20:42:51 +0530 Subject: [PATCH 28/30] INTEGRITY: Merge filtering logic for glk with existing set.dat filtering. 
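For glk engines the shared filter keeps the same SQL shape but matches candidates by gameid instead of file name, and the per-file counting falls back to sizes. A rough sketch of just the counting branch, with illustrative names and without the checksum refinement the real code performs:

    def count_matching_detection_files(files, set_file_name_size, set_glk_file_size, is_glk):
        # files: detection files of one candidate as (basename, size) tuples
        count = 0
        for name, size in files:
            if is_glk and (size in set_glk_file_size or size == 0):
                count += 1
                continue
            if (name, size) in set_file_name_size:
                count += 1
        return count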
--- db_functions.py | 119 ++++++++++++------------------------------------ 1 file changed, 28 insertions(+), 91 deletions(-) diff --git a/db_functions.py b/db_functions.py index f9e9fe0..897d346 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1659,14 +1659,9 @@ def set_process( # Separating out the matching logic for glk engine engine_name = fileset["sourcefile"].split("-")[0] - if engine_name == "glk": - (candidate_filesets, fileset_count) = set_glk_filter_candidate_filesets( - fileset_id, fileset, fileset_count, transaction_id, engine_name, conn - ) - else: - (candidate_filesets, fileset_count) = set_filter_candidate_filesets( - fileset_id, fileset, fileset_count, transaction_id, conn - ) + (candidate_filesets, fileset_count) = set_filter_candidate_filesets( + fileset_id, fileset, fileset_count, transaction_id, engine_name, conn + ) # Mac files in set.dat are not represented properly and they won't find a candidate fileset for a match, so we can drop them. if len(candidate_filesets) == 0: @@ -2071,93 +2066,16 @@ def is_full_checksum_match(candidate_fileset, fileset, conn): return (len(unmatched_files) == 0, unmatched_files) -def set_glk_filter_candidate_filesets( - fileset_id, fileset, fileset_count, transaction_id, engine_name, conn -): - """ - Returns a list of candidate filesets for glk engines that can be merged - """ - with conn.cursor() as cursor: - # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches - fileset_count += 1 - console_log_candidate_filtering(fileset_count) - query = """ - WITH candidate_fileset AS ( - SELECT fs.id AS fileset_id, f.size - FROM file f - JOIN fileset fs ON f.fileset = fs.id - JOIN game g ON g.id = fs.game - JOIN engine e ON e.id = g.engine - JOIN transactions t ON t.fileset = fs.id - WHERE fs.id != %s - AND e.engineid = %s - AND f.detection = 1 - AND t.transaction != %s - AND (g.gameid = %s OR (g.gameid != %s AND g.gameid LIKE %s)) - ), - total_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS detection_files_found - FROM candidate_fileset cf - GROUP BY fileset_id - ), - set_fileset AS ( - SELECT size FROM file - WHERE fileset = %s - ), - matched_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS match_files_count - FROM candidate_fileset cf - JOIN set_fileset sf ON - cf.size = sf.size OR cf.size = 0 - GROUP BY cf.fileset_id - ), - valid_matched_detection_files AS ( - SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count - FROM matched_detection_files mdf - JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id - WHERE tdf.detection_files_found <= mdf.match_files_count - ), - max_match_count AS ( - SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files - ) - SELECT vmdf.fileset_id - FROM valid_matched_detection_files vmdf - JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id - JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count - """ - - gameid_pattern = f"%{fileset['name']}%" - - cursor.execute( - query, - ( - fileset_id, - engine_name, - transaction_id, - fileset["name"], - fileset["name"], - gameid_pattern, - fileset_id, - ), - ) - rows = cursor.fetchall() - - candidates = [] - if rows: - for row in rows: - candidates.append(row["fileset_id"]) - - return (candidates, fileset_count) - - def set_filter_candidate_filesets( - fileset_id, fileset, fileset_count, transaction_id, conn + fileset_id, fileset, fileset_count, 
transaction_id, engine_name, conn ): """ Returns a list of candidate filesets that can be merged. Performs early filtering in SQL (by engine, name, size) and then applies checksum filtering and max-match filtering in Python. + In case of glk engines, filtering is not by name, rather gameid is used. """ + is_glk = engine_name == "glk" with conn.cursor() as cursor: fileset_count += 1 console_log_candidate_filtering(fileset_count) @@ -2174,7 +2092,21 @@ def set_filter_candidate_filesets( AND f.detection = 1 AND t.transaction != %s """ - cursor.execute(query, (fileset["sourcefile"], transaction_id)) + if is_glk: + query += " AND (g.gameid = %s OR (g.gameid != %s AND g.gameid LIKE %s))" + gameid_pattern = f"%{fileset['name']}%" + cursor.execute( + query, + ( + engine_name, + transaction_id, + fileset["name"], + fileset["name"], + gameid_pattern, + ), + ) + else: + cursor.execute(query, (fileset["sourcefile"], transaction_id)) raw_candidates = cursor.fetchall() # fileset id to detection files map @@ -2184,7 +2116,7 @@ def set_filter_candidate_filesets( candidate_map[row["fileset_id"]].append( { "file_id": row["file_id"], - "name": row["name"], + "name": os.path.basename(normalised_path(row["name"])).lower(), "size": row["size"], } ) @@ -2193,14 +2125,17 @@ def set_filter_candidate_filesets( set_checksums = set() set_file_name_size = set() + set_glk_file_size = set() for file in fileset["rom"]: + name = os.path.basename(normalised_path(file["name"])) for key in file: if key.startswith("md5"): - name = os.path.basename(normalised_path(file["name"])) set_checksums.add((file[key], name.lower(), int(file["size"]))) set_checksums.add((file[key], name.lower(), -1)) set_file_name_size.add((name.lower(), -1)) set_file_name_size.add((name.lower(), int(file["size"]))) + if is_glk: + set_glk_file_size.add(int(file["size"])) # Filter candidates by detection filename and file size (including -1) and increase matched file count # if filesize = -1, @@ -2213,6 +2148,8 @@ def set_filter_candidate_filesets( for f in files: filename = os.path.basename(f["name"]).lower() filesize = f["size"] + if is_glk and (filesize in set_glk_file_size or filesize == 0): + count += 1 if (filename, filesize) in set_file_name_size: if filesize == -1: count += 1 From c9dca049ba2f3ef18820ce778bf0610fa1b35c02 Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Fri, 18 Jul 2025 00:40:57 +0530 Subject: [PATCH 29/30] INTEGRITY: Add checksum filtering before max files filtering in scan.dat processing --- db_functions.py | 183 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 130 insertions(+), 53 deletions(-) diff --git a/db_functions.py b/db_functions.py index 897d346..2ae4778 100644 --- a/db_functions.py +++ b/db_functions.py @@ -1059,6 +1059,7 @@ def scan_process( create_log(escape_string(category_text), user, escape_string(log_text), conn) category_text = "Upload information" log_text = f"Number of filesets: {fileset_insertion_count}. Filesets automatically merged: {automatic_merged_filesets}. Filesets requiring manual merge (multiple candidates): {manual_merged_filesets}. Filesets requiring manual merge (matched with detection): {manual_merged_with_detection}. Filesets dropped, no candidate: {dropped_early_no_candidate}. Filesets matched with existing Full fileset: {match_with_full_fileset}. Filesets with mismatched files with Full fileset: {mismatch_with_full_fileset}. Filesets missing files compared to partial fileset candidate: {filesets_with_missing_files}." 
+ console_log(log_text) create_log(escape_string(category_text), user, escape_string(log_text), conn) @@ -1115,9 +1116,12 @@ def pre_update_files(rom, filesets_check_for_full, transaction_id, conn): SET size = %s, `size-r` = %s, `size-rd` = %s, + name = %s WHERE id = %s """ - cursor.execute(query, size, size_r, size_rd, file_id) + cursor.execute( + query, (size, size_r, size_rd, normalised_path(rom["name"]), file_id) + ) def scan_perform_match( @@ -1396,73 +1400,146 @@ def total_fileset_files(fileset): def scan_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn): """ - Returns a list of candidate filesets that can be merged + Returns a list of candidate filesets that can be merged. + Performs early filtering in SQL (by name, size) and then + applies checksum filtering and max-match filtering in Python. """ with conn.cursor() as cursor: - # Returns those filesets which have all detection files matching in the scan fileset filtered by file name and file size(if not -1). - + # Fetching detection filename and all sizes (size, size-r, size-rd) from database query = """ - WITH candidate_fileset AS ( - SELECT fs.id AS fileset_id, f.name, f.size, + SELECT fs.id AS fileset_id, f.id as file_id, f.name, f.size, f.`size-r` AS size_r, f.`size-rd` AS size_rd FROM file f JOIN fileset fs ON f.fileset = fs.id JOIN game g ON g.id = fs.game + JOIN engine e ON e.id = g.engine JOIN transactions t ON t.fileset = fs.id WHERE f.detection = 1 AND t.transaction != %s - ), - total_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS detection_files_found - FROM candidate_fileset cf - GROUP BY fileset_id - ), - set_fileset AS ( - SELECT name, size, - `size-r` AS size_r, `size-rd` AS size_rd - FROM file - WHERE fileset = %s - ), - matched_detection_files AS ( - SELECT cf.fileset_id, COUNT(*) AS match_files_count - FROM candidate_fileset cf - JOIN set_fileset sf ON ( ( - cf.name = sf.name - OR - REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '') - ) AND (cf.size = sf.size OR cf.size = -1) - AND (cf.size_r = sf.size_r) - AND (cf.size_rd = sf.size_rd)) - GROUP BY cf.fileset_id - ), - valid_matched_detection_files AS ( - SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count - FROM matched_detection_files mdf - JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id - WHERE tdf.detection_files_found <= mdf.match_files_count - ), - max_match_count AS ( - SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files - ) - SELECT vmdf.fileset_id - FROM valid_matched_detection_files vmdf - JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id - JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count """ + cursor.execute(query, (transaction_id,)) + raw_candidates = cursor.fetchall() + + # fileset id to detection files map + candidate_map = defaultdict(list) + total_detection_files_map = defaultdict(int) + for row in raw_candidates: + candidate_map[row["fileset_id"]].append( + { + "file_id": row["file_id"], + "name": os.path.basename(normalised_path(row["name"])).lower(), + "size": row["size"], + "size-r": row["size_r"], + "size-rd": row["size_rd"], + } + ) + for id, files in candidate_map.items(): + total_detection_files_map[id] = len(files) + + set_checksums = set() + set_file_name_size = set() + for file in fileset["rom"]: + name = os.path.basename(normalised_path(file["name"])) + for key in file: + if key.startswith("md5"): + set_checksums.add( + ( + file[key], + name.lower(), + 
int(file["size"]), + int(file["size-r"]), + int(file["size-rd"]), + ) + ) + set_checksums.add( + ( + file[key], + name.lower(), + -1, + int(file["size-r"]), + int(file["size-rd"]), + ) + ) + set_file_name_size.add( + (name.lower(), -1, int(file["size-r"]), int(file["size-rd"])) + ) + set_file_name_size.add( + (name.lower(), int(file["size"]), int(file["size-r"]), int(file["size-rd"])) + ) + + # Filter candidates by detection filename and file size (including -1) and increase matched file count + # if filesize = -1, + # elif filesize <= checksize and checksum matches, + # elif filesize > checksize. + match_counts = {} + for fileset_id, files in candidate_map.items(): + count = 0 + with conn.cursor() as cursor: + for f in files: + filename = os.path.basename(f["name"]).lower() + size = f["size"] + size_r = f["size-r"] + size_rd = f["size-rd"] + if (filename, size, size_r, size_rd) in set_file_name_size: + if size == -1: + count += 1 + else: + cursor.execute( + """ + SELECT checksum, checksize, checktype + FROM filechecksum + WHERE file = %s + """, + (f["file_id"],), + ) + checksums = cursor.fetchall() + not_inc_count = False + for c in checksums: + filesize = size + checksum = c["checksum"] + checksize = c["checksize"] + checktype = c["checktype"] + # Macfiles handling + if checktype in ["md5-r", "md5-rt"]: + filesize = size_rd - cursor.execute(query, (transaction_id, fileset_id)) - rows = cursor.fetchall() + if checksize == "1M": + checksize = 1048576 + elif checksize == "0": + checksize = filesize + if filesize <= int(checksize): + if ( + checksum, + filename, + size, + size_r, + size_rd, + ) in set_checksums: + count += 1 + not_inc_count = True + # if it was a true match, checksum should be present + break + if not not_inc_count: + count += 1 + if count > 0 and total_detection_files_map[fileset_id] <= count: + match_counts[fileset_id] = count + + # Filter only entries with maximum number of matched files + if not match_counts: + return [] - candidates = [] - if rows: - for row in rows: - candidates.append(row["fileset_id"]) + max_match = max(match_counts.values()) + candidates = [fid for fid, count in match_counts.items() if count == max_match] - for candidate in candidates: - if not is_full_detection_checksum_match(candidate, fileset, conn): - candidates.remove(candidate) + matched_candidates = [] + for candidate in candidates: + if is_full_detection_checksum_match(candidate, fileset, conn): + matched_candidates.append(candidate) + + if len(matched_candidates) != 0: + candidates = matched_candidates - return candidates + return candidates def get_unmatched_files(candidate_fileset, fileset, conn): From 5e54aa690e6c6c6672902b63277a12eed9054b8b Mon Sep 17 00:00:00 2001 From: ShivangNagta Date: Fri, 18 Jul 2025 02:28:22 +0530 Subject: [PATCH 30/30] INTEGRITY: Parameterising all sql queries in db_functions.py --- db_functions.py | 187 +++++++++++++++++++++++++++--------------------- 1 file changed, 104 insertions(+), 83 deletions(-) diff --git a/db_functions.py b/db_functions.py index 2ae4778..490bceb 100644 --- a/db_functions.py +++ b/db_functions.py @@ -79,24 +79,26 @@ def insert_game(engine_name, engineid, title, gameid, extra, platform, lang, con # Set @engine_last if engine already present in table exists = False with conn.cursor() as cursor: - cursor.execute(f"SELECT id FROM engine WHERE engineid = '{engineid}'") + cursor.execute("SELECT id FROM engine WHERE engineid = %s", (engineid,)) res = cursor.fetchone() if res is not None: exists = True - cursor.execute(f"SET @engine_last = 
'{res['id']}'") + cursor.execute("SET @engine_last = %s", (res["id"],)) # Insert into table if not present if not exists: with conn.cursor() as cursor: cursor.execute( - f"INSERT INTO engine (name, engineid) VALUES ('{escape_string(engine_name)}', '{engineid}')" + "INSERT INTO engine (name, engineid) VALUES (%s, %s)", + (engine_name, engineid), ) cursor.execute("SET @engine_last = LAST_INSERT_ID()") # Insert into game with conn.cursor() as cursor: cursor.execute( - f"INSERT INTO game (name, engine, gameid, extra, platform, language) VALUES ('{escape_string(title)}', @engine_last, '{gameid}', '{escape_string(extra)}', '{platform}', '{lang}')" + "INSERT INTO game (name, engine, gameid, extra, platform, language) VALUES (%s, @engine_last, %s, %s, %s, %s)", + (title, gameid, extra, platform, lang), ) cursor.execute("SET @game_last = LAST_INSERT_ID()") @@ -129,24 +131,27 @@ def insert_fileset( # Check if key/megakey already exists, if so, skip insertion (no quotes on purpose) if detection: with conn.cursor() as cursor: - cursor.execute(f"SELECT id FROM fileset WHERE megakey = {megakey}") + cursor.execute("SELECT id FROM fileset WHERE megakey = %s", (megakey,)) existing_entry = cursor.fetchone() else: with conn.cursor() as cursor: - cursor.execute(f"SELECT id FROM fileset WHERE `key` = {key}") + cursor.execute("SELECT id FROM fileset WHERE `key` = %s", (key,)) existing_entry = cursor.fetchone() if existing_entry is not None: existing_entry = existing_entry["id"] with conn.cursor() as cursor: - cursor.execute(f"SET @fileset_last = {existing_entry}") - cursor.execute(f"DELETE FROM file WHERE fileset = {existing_entry}") + cursor.execute("SET @fileset_last = %s", (existing_entry,)) + cursor.execute("DELETE FROM file WHERE fileset = %s", (existing_entry,)) cursor.execute( - f"UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = {existing_entry}" + "UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = %s", + (existing_entry,), + ) + cursor.execute( + "SELECT status FROM fileset WHERE id = %s", (existing_entry,) ) - cursor.execute(f"SELECT status FROM fileset WHERE id = {existing_entry}") status = cursor.fetchone()["status"] if status == "user": add_usercount(existing_entry, conn) @@ -162,10 +167,10 @@ def insert_fileset( return (existing_entry, True) # $game and $key should not be parsed as a mysql string, hence no quotes - query = f"INSERT INTO fileset (game, status, src, `key`, megakey, `timestamp`, set_dat_metadata) VALUES ({game}, '{status}', '{src}', {key}, {megakey}, FROM_UNIXTIME(@fileset_time_last), '{escape_string(set_dat_metadata)}')" + query = "INSERT INTO fileset (game, status, src, `key`, megakey, `timestamp`, set_dat_metadata) VALUES (%s, %s, %s, %s, %s, FROM_UNIXTIME(@fileset_time_last), %s)" fileset_id = -1 with conn.cursor() as cursor: - cursor.execute(query) + cursor.execute(query, (game, status, src, key, megakey, set_dat_metadata)) fileset_id = cursor.lastrowid cursor.execute("SET @fileset_last = LAST_INSERT_ID()") @@ -188,7 +193,8 @@ def insert_fileset( update_history(0, fileset_last, conn) with conn.cursor() as cursor: cursor.execute( - f"INSERT INTO transactions (`transaction`, fileset) VALUES ({transaction}, {fileset_last})" + "INSERT INTO transactions (`transaction`, fileset) VALUES (%s, %s)", + (transaction, fileset_last), ) return (fileset_id, False) @@ -230,17 +236,11 @@ def insert_file(file, detection, src, conn): values.append(file["size"] if "size" in file else "0") values.append(file["size-r"] if "size-r" in file else 
"0") values.append(file["size-rd"] if "size-rd" in file else "0") - - modification_time = file["modification-time"] if "modification-time" in file else "" - values.append(modification_time) - + values.append(file["modification-time"] if "modification-time" in file else "") values.extend([checksum, detection, detection_type]) # Parameterised Query - placeholders = ( - ["%s"] * (len(values[:6])) + ["@fileset_last"] + ["%s"] * 2 + ["NOW()"] - ) - query = f"INSERT INTO file ( name, size, `size-r`, `size-rd`, `modification-time`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ({', '.join(placeholders)})" + query = "INSERT INTO file ( name, size, `size-r`, `size-rd`, `modification-time`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES (%s, %s, %s, %s, %s, %s, @fileset_last, %s, %s, NOW())" with conn.cursor() as cursor: cursor.execute(query, values) @@ -248,7 +248,8 @@ def insert_file(file, detection, src, conn): if detection: with conn.cursor() as cursor: cursor.execute( - f"UPDATE fileset SET detection_size = {checksize} WHERE id = @fileset_last AND detection_size IS NULL" + "UPDATE fileset SET detection_size = %s WHERE id = @fileset_last AND detection_size IS NULL", + (checksize,), ) with conn.cursor() as cursor: cursor.execute("SET @file_last = LAST_INSERT_ID()") @@ -279,7 +280,7 @@ def add_all_equal_checksums(checksize, checktype, checksum, file_id, conn): if checktype[-1] == "r": size_name += "-rd" - cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = {file_id}") + cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = %s", (file_id,)) result = cursor.fetchone() if not result: return @@ -375,9 +376,10 @@ def punycode_need_encode(orig): def create_log(category, user, text, conn): query = f"INSERT INTO log (`timestamp`, category, user, `text`) VALUES (FROM_UNIXTIME({int(time.time())}), '{escape_string(category)}', '{escape_string(user)}', '{escape_string(text)}')" + query = "INSERT INTO log (`timestamp`, category, user, `text`) VALUES (FROM_UNIXTIME(%s), %s, %s, %s)" with conn.cursor() as cursor: try: - cursor.execute(query) + cursor.execute(query, (int(time.time()), category, user, text)) conn.commit() except Exception as e: conn.rollback() @@ -390,10 +392,12 @@ def create_log(category, user, text, conn): def update_history(source_id, target_id, conn, log_last=None): - query = f"INSERT INTO history (`timestamp`, fileset, oldfileset, log) VALUES (NOW(), {target_id}, {source_id}, {log_last if log_last is not None else 0})" + query = "INSERT INTO history (`timestamp`, fileset, oldfileset, log) VALUES (NOW(), %s, %s, %s)" with conn.cursor() as cursor: try: - cursor.execute(query) + cursor.execute( + query, (target_id, source_id, log_last if log_last is not None else 0) + ) conn.commit() except Exception as e: conn.rollback() @@ -418,7 +422,8 @@ def get_all_related_filesets(fileset_id, conn, visited=None): try: with conn.cursor() as cursor: cursor.execute( - f"SELECT fileset, oldfileset FROM history WHERE fileset = {fileset_id} OR oldfileset = {fileset_id}" + "SELECT fileset, oldfileset FROM history WHERE fileset = %s OR oldfileset = %s", + (fileset_id, fileset_id), ) history_records = cursor.fetchall() @@ -516,7 +521,7 @@ def db_insert(data_arr, username=None, skiplog=False): detection = src == "scummvm" status = "detection" if detection else src - conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}") + conn.cursor().execute("SET @fileset_time_last = %s", (int(time.time()),)) with conn.cursor() as cursor: cursor.execute("SELECT 
MAX(`transaction`) FROM transactions") @@ -611,7 +616,8 @@ def db_insert(data_arr, username=None, skiplog=False): try: cur.execute( - f"SELECT COUNT(fileset) from transactions WHERE `transaction` = {transaction_id}" + "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", + (transaction_id,), ) fileset_insertion_count = cur.fetchone()["COUNT(fileset)"] category_text = f"Uploaded from {src}" @@ -627,11 +633,13 @@ def db_insert(data_arr, username=None, skiplog=False): def compare_filesets(id1, id2, conn): with conn.cursor() as cursor: cursor.execute( - f"SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = '{id1}'" + "SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = %s", + (id1,), ) fileset1 = cursor.fetchall() cursor.execute( - f"SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = '{id2}'" + "SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = %s", + (id2,), ) fileset2 = cursor.fetchall() @@ -665,9 +673,9 @@ def find_matching_game(game_files): for file in game_files: checksum = file[1] - query = f"SELECT file.fileset as file_fileset FROM filechecksum JOIN file ON filechecksum.file = file.id WHERE filechecksum.checksum = '{checksum}' AND file.detection = TRUE" + query = "SELECT file.fileset as file_fileset FROM filechecksum JOIN file ON filechecksum.file = file.id WHERE filechecksum.checksum = %s AND file.detection = TRUE" with conn.cursor() as cursor: - cursor.execute(query) + cursor.execute(query, (checksum,)) records = cursor.fetchall() # If file is not part of detection entries, skip it @@ -682,7 +690,8 @@ def find_matching_game(game_files): for key, value in Counter(matching_filesets).items(): with conn.cursor() as cursor: cursor.execute( - f"SELECT COUNT(file.id) FROM file JOIN fileset ON file.fileset = fileset.id WHERE fileset.id = '{key}'" + "SELECT COUNT(file.id) FROM file JOIN fileset ON file.fileset = fileset.id WHERE fileset.id = %s", + (key,), ) count_files_in_fileset = cursor.fetchone()["COUNT(file.id)"] @@ -693,7 +702,8 @@ def find_matching_game(game_files): with conn.cursor() as cursor: cursor.execute( - f"SELECT engineid, game.id, gameid, platform, language, `key`, src, fileset.id as fileset FROM game JOIN fileset ON fileset.game = game.id JOIN engine ON engine.id = game.engine WHERE fileset.id = '{key}'" + "SELECT engineid, game.id, gameid, platform, language, `key`, src, fileset.id as fileset FROM game JOIN fileset ON fileset.game = game.id JOIN engine ON engine.id = game.engine WHERE fileset.id = %s", + (key,), ) records = cursor.fetchall() @@ -717,7 +727,7 @@ def find_matching_game(game_files): if compare_filesets(matching_games[0]["fileset"], game_files[0][0], conn): with conn.cursor() as cursor: cursor.execute( - f"UPDATE fileset SET `delete` = TRUE WHERE id = {game_files[0][0]}" + "UPDATE fileset SET `delete` = TRUE WHERE id = %s", (game_files[0][0],) ) return [] @@ -730,7 +740,8 @@ def merge_filesets(detection_id, dat_id): try: with conn.cursor() as cursor: cursor.execute( - f"SELECT DISTINCT(filechecksum.checksum), checksize, checktype FROM filechecksum JOIN file on file.id = filechecksum.file WHERE fileset = '{detection_id}'" + "SELECT DISTINCT(filechecksum.checksum), checksize, checktype FROM filechecksum JOIN file on file.id = filechecksum.file WHERE fileset = %s'", + (detection_id,), ) detection_files = cursor.fetchall() @@ -740,22 +751,26 @@ def merge_filesets(detection_id, dat_id): checktype = file[2] cursor.execute( - f"DELETE FROM file WHERE checksum = 
'{checksum}' AND fileset = {detection_id} LIMIT 1" + "DELETE FROM file WHERE checksum = %s AND fileset = %s LIMIT 1", + (checksum, detection_id), ) cursor.execute( - f"UPDATE file JOIN filechecksum ON filechecksum.file = file.id SET detection = TRUE, checksize = {checksize}, checktype = '{checktype}' WHERE fileset = '{dat_id}' AND filechecksum.checksum = '{checksum}'" + "UPDATE file JOIN filechecksum ON filechecksum.file = file.id SET detection = TRUE, checksize = %s, checktype = %s WHERE fileset = %s AND filechecksum.checksum = %s", + (checksize, checktype, dat_id, checksum), ) cursor.execute( - f"INSERT INTO history (`timestamp`, fileset, oldfileset) VALUES (FROM_UNIXTIME({int(time.time())}), {dat_id}, {detection_id})" + "INSERT INTO history (`timestamp`, fileset, oldfileset) VALUES (FROM_UNIXTIME(%s), %s, %s)", + (int(time.time()), dat_id, detection_id), ) cursor.execute("SELECT LAST_INSERT_ID()") history_last = cursor.fetchone()["LAST_INSERT_ID()"] cursor.execute( - f"UPDATE history SET fileset = {dat_id} WHERE fileset = {detection_id}" + "UPDATE history SET fileset = %s WHERE fileset = %s", + (dat_id, detection_id), ) - cursor.execute(f"DELETE FROM fileset WHERE id = {detection_id}") + cursor.execute("DELETE FROM fileset WHERE id = %s", (detection_id,)) conn.commit() except Exception as e: @@ -812,11 +827,13 @@ def populate_matching_games(): log_text = f"Matched game {matched_game['engineid']}:\n{matched_game['gameid']}-{matched_game['platform']}-{matched_game['language']}\nvariant {matched_game['key']}. State {status}. Fileset:{fileset[0][0]}." # Updating the fileset.game value to be $matched_game["id"] - query = f"UPDATE fileset SET game = {matched_game['id']}, status = '{status}', `key` = '{matched_game['key']}' WHERE id = {fileset[0][0]}" + query = "UPDATE fileset SET game = %s, status = %s, `key` = %s WHERE id = %s" history_last = merge_filesets(matched_game["fileset"], fileset[0][0]) - if cursor.execute(query): + if cursor.execute( + query, (matched_game["id"], status, matched_game["key"], fileset[0][0]) + ): user = f"cli:{getpass.getuser()}" create_log( @@ -835,7 +852,7 @@ def populate_matching_games(): # Add log id to the history table cursor.execute( - f"UPDATE history SET log = {log_last} WHERE id = {history_last}" + "UPDATE history SET log = %s WHERE id = %s", (log_last, history_last) ) try: @@ -873,7 +890,7 @@ def match_fileset(data_arr, username=None, skiplog=False): detection = src == "scummvm" source_status = "detection" if detection else src - conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}") + conn.cursor().execute("SET @fileset_time_last = %s", (int(time.time()),)) with conn.cursor() as cursor: cursor.execute("SELECT MAX(`transaction`) FROM transactions") @@ -1280,7 +1297,8 @@ def update_all_files(fileset, candidate_fileset_id, is_candidate_detection, conn with conn.cursor() as cursor: # Extracting the filename from the filepath. 
cursor.execute( - f"SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = {candidate_fileset_id}" + "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = %s", + (candidate_fileset_id,), ) target_files = cursor.fetchall() candidate_files = { @@ -2417,13 +2435,13 @@ def find_matching_filesets(fileset, conn, status): checksize, checktype, checksum = get_checksum_props( checktype, checksum ) - query = f"""SELECT DISTINCT fs.id AS fileset_id + query = """SELECT DISTINCT fs.id AS fileset_id FROM fileset fs JOIN file f ON fs.id = f.fileset JOIN filechecksum fc ON f.id = fc.file - WHERE fc.checksum = '{checksum}' AND fc.checktype = '{checktype}' - AND fs.status IN ({state})""" - cursor.execute(query) + WHERE fc.checksum = %s AND fc.checktype = %s + AND fs.status IN (%s)""" + cursor.execute(query, (checksum, checktype, state)) records = cursor.fetchall() if records: for record in records: @@ -2446,16 +2464,16 @@ def matching_set(fileset, conn): checksum = checksum.split(":")[1] size = file["size"] - query = f""" + query = """ SELECT DISTINCT fs.id AS fileset_id FROM fileset fs JOIN file f ON fs.id = f.fileset JOIN filechecksum fc ON f.id = fc.file - WHERE fc.checksum = '{checksum}' AND fc.checktype LIKE 'md5%' - AND fc.checksize > {size} + WHERE fc.checksum = %s AND fc.checktype LIKE 'md5%' + AND fc.checksize > %s AND fs.status = 'detection' """ - cursor.execute(query) + cursor.execute(query, (checksum, size)) records = cursor.fetchall() if records: for record in records: @@ -2485,11 +2503,12 @@ def handle_matched_filesets( if is_full_matched: break cursor.execute( - f"SELECT status FROM fileset WHERE id = {matched_fileset_id}" + "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,) ) status = cursor.fetchone()["status"] cursor.execute( - f"SELECT COUNT(file.id) FROM file WHERE fileset = {matched_fileset_id}" + "SELECT COUNT(file.id) FROM file WHERE fileset = %s", + (matched_fileset_id,), ) count = cursor.fetchone()["COUNT(file.id)"] @@ -2535,28 +2554,31 @@ def handle_matched_filesets( def delete_original_fileset(fileset_id, conn): with conn.cursor() as cursor: - cursor.execute(f"DELETE FROM file WHERE fileset = {fileset_id}") - cursor.execute(f"DELETE FROM fileset WHERE id = {fileset_id}") + cursor.execute("DELETE FROM file WHERE fileset = %s", (fileset_id,)) + cursor.execute("DELETE FROM fileset WHERE id = %s", (fileset_id,)) conn.commit() def update_fileset_status(cursor, fileset_id, status): - cursor.execute(f""" + cursor.execute( + """ UPDATE fileset SET - status = '{status}', - `timestamp` = FROM_UNIXTIME({int(time.time())}) - WHERE id = {fileset_id} - """) + status = %s, + `timestamp` = FROM_UNIXTIME(%s) + WHERE id = %s + """, + (status, int(time.time()), fileset_id), + ) def populate_file(fileset, fileset_id, conn, detection): with conn.cursor() as cursor: - cursor.execute(f"SELECT * FROM file WHERE fileset = {fileset_id}") + cursor.execute("SELECT * FROM file WHERE fileset = %s", (fileset_id,)) target_files = cursor.fetchall() target_files_dict = {} for target_file in target_files: cursor.execute( - f"SELECT * FROM filechecksum WHERE file = {target_file['id']}" + "SELECT * FROM filechecksum WHERE file = %s", (target_file["id"],) ) target_checksums = cursor.fetchall() for checksum in target_checksums: @@ -2681,7 +2703,8 @@ def set_populate_file(fileset, fileset_id, conn, detection): with conn.cursor() as cursor: # Extracting the filename from the filepath. 
cursor.execute( - f"SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = {fileset_id}" + "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = %s", + (fileset_id,), ) target_files = cursor.fetchall() candidate_files = { @@ -2723,23 +2746,17 @@ def set_populate_file(fileset, fileset_id, conn, detection): ): name = normalised_path(file["name"]) values = [name] - values.append(file["size"] if "size" in file else "0") values.append(file["size-r"] if "size-r" in file else "0") values.append(file["size-rd"] if "size-rd" in file else "0") - values.extend([checksum, fileset_id, detection, "None"]) - placeholders = ( - ["%s"] * (len(values[:5])) + ["%s"] + ["%s"] * 2 + ["NOW()"] - ) - query = f"INSERT INTO file ( name, size, `size-r`, `size-rd`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ({', '.join(placeholders)})" + query = "INSERT INTO file ( name, size, `size-r`, `size-rd`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, NOW())" cursor.execute(query, values) cursor.execute("SET @file_last = LAST_INSERT_ID()") cursor.execute("SELECT @file_last AS file_id") - cursor.execute("SELECT @file_last AS file_id") file_id = cursor.fetchone()["file_id"] insert_filechecksum(file, "md5", file_id, conn) @@ -2877,7 +2894,8 @@ def finalize_fileset_insertion( ): with conn.cursor() as cursor: cursor.execute( - f"SELECT COUNT(fileset) from transactions WHERE `transaction` = {transaction_id}" + "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s", + (transaction_id,), ) fileset_insertion_count = cursor.fetchone()["COUNT(fileset)"] category_text = f"Uploaded from {src}" @@ -2915,7 +2933,7 @@ def user_integrity_check(data, ip, game_metadata=None): print(f"Failed to connect to database: {e}") return - conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}") + conn.cursor().execute("SET @fileset_time_last = %s", (int(time.time()),)) try: with conn.cursor() as cursor: @@ -2940,12 +2958,13 @@ def user_integrity_check(data, ip, game_metadata=None): missing_set = set() for fileset_id in matched_map.keys(): - cursor.execute(f"SELECT * FROM file WHERE fileset = {fileset_id}") + cursor.execute("SELECT * FROM file WHERE fileset = %s", (fileset_id,)) target_files = cursor.fetchall() target_files_dict = {} for target_file in target_files: cursor.execute( - f"SELECT * FROM filechecksum WHERE file = {target_file['id']}" + "SELECT * FROM filechecksum WHERE file = %s", + (target_file["id"],), ) target_checksums = cursor.fetchall() for checksum in target_checksums: @@ -3025,12 +3044,13 @@ def user_integrity_check(data, ip, game_metadata=None): most_matched = matched_list[0] matched_fileset_id, matched_count = most_matched[0], most_matched[1] cursor.execute( - f"SELECT status FROM fileset WHERE id = {matched_fileset_id}" + "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,) ) status = cursor.fetchone()["status"] cursor.execute( - f"SELECT COUNT(file.id) FROM file WHERE fileset = {matched_fileset_id}" + "SELECT COUNT(file.id) FROM file WHERE fileset = %s", + (matched_fileset_id,), ) count = cursor.fetchone()["COUNT(file.id)"] if status == "full" and count == matched_count: @@ -3068,13 +3088,14 @@ def user_integrity_check(data, ip, game_metadata=None): def add_usercount(fileset, conn): with conn.cursor() as cursor: cursor.execute( - f"UPDATE fileset SET user_count = COALESCE(user_count, 0) + 1 WHERE id = {fileset}" + "UPDATE fileset SET user_count 
= COALESCE(user_count, 0) + 1 WHERE id = %s", + (fileset,), ) - cursor.execute(f"SELECT user_count from fileset WHERE id = {fileset}") + cursor.execute("SELECT user_count from fileset WHERE id = %s", (fileset,)) count = cursor.fetchone()["user_count"] if count >= 3: cursor.execute( - f"UPDATE fileset SET status = 'ReadyForReview' WHERE id = {fileset}" + "UPDATE fileset SET status = 'ReadyForReview' WHERE id = %s", (fileset,) )
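
A note on the parameter binding introduced above: with a format-style MySQL driver such as PyMySQL or mysql-connector (the dict-style fetchone() results in these files suggest one of the two, though the patches never name the driver), two of the converted queries deserve extra care. A literal % in a query that is executed with an args tuple, as in LIKE 'md5%' in matching_set(), normally has to be written as %%, and IN (%s) in find_matching_filesets() binds exactly one value, so if `state` holds a comma-joined list of statuses it would be compared as a single literal. The sketch below shows how both cases are commonly handled with this kind of driver; the helper names, the list-valued `states` argument, and the dict-style rows are illustrative assumptions, not part of db_functions.py.

def find_filesets_by_checksum(cursor, checksum, checktype, states):
    # One placeholder per status value; "IN (%s)" with a pre-joined string
    # would compare fs.status against that whole string as a single literal.
    placeholders = ", ".join(["%s"] * len(states))
    query = (
        "SELECT DISTINCT fs.id AS fileset_id "
        "FROM fileset fs "
        "JOIN file f ON fs.id = f.fileset "
        "JOIN filechecksum fc ON f.id = fc.file "
        f"WHERE fc.checksum = %s AND fc.checktype = %s AND fs.status IN ({placeholders})"
    )
    cursor.execute(query, (checksum, checktype, *states))
    return [row["fileset_id"] for row in cursor.fetchall()]


def find_detection_filesets_for_md5(cursor, checksum, size):
    # The literal % in the LIKE pattern is doubled to %% so the driver does
    # not read it as a format placeholder when the args tuple is applied.
    query = (
        "SELECT DISTINCT fs.id AS fileset_id "
        "FROM fileset fs "
        "JOIN file f ON fs.id = f.fileset "
        "JOIN filechecksum fc ON f.id = fc.file "
        "WHERE fc.checksum = %s AND fc.checktype LIKE 'md5%%' "
        "AND fc.checksize > %s AND fs.status = 'detection'"
    )
    cursor.execute(query, (checksum, size))
    return [row["fileset_id"] for row in cursor.fetchall()]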
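
Several of these hunks read back the id of a freshly inserted file by setting the @file_last session variable and selecting it again. Both PyMySQL and mysql-connector also expose that value as cursor.lastrowid on the cursor that ran the INSERT, which can stand in for the extra SET/SELECT round trip. A minimal sketch under that assumption, with an illustrative column list (the real insert_file() builds a wider row):

def insert_file_row(cursor, name, size, fileset_id):
    # Illustrative INSERT; only the id-retrieval pattern is the point here.
    cursor.execute(
        "INSERT INTO file (name, size, fileset, `timestamp`) VALUES (%s, %s, %s, NOW())",
        (name, size, fileset_id),
    )
    # lastrowid mirrors LAST_INSERT_ID() for the cursor that ran the INSERT,
    # so the SET @file_last / SELECT @file_last pair can be skipped.
    return cursor.lastrowid

The returned id can then be handed straight to insert_filechecksum(file, key, file_id, conn), exactly as the new signature in these patches expects.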