diff --git a/clear.py b/clear.py
index 707914c8..acdae141 100644
--- a/clear.py
+++ b/clear.py
@@ -7,26 +7,30 @@
import json
import os
+
def truncate_all_tables(conn):
+ # fmt: off
tables = ["filechecksum", "queue", "history", "transactions", "file", "fileset", "game", "engine", "log"]
cursor = conn.cursor()
-
+ # fmt: on
+
# Disable foreign key checks
cursor.execute("SET FOREIGN_KEY_CHECKS = 0")
-
+
for table in tables:
try:
- cursor.execute(f"TRUNCATE TABLE `{table}`")
+            # Table names cannot be bound as query parameters; the list above is hardcoded, so interpolation is safe
+            cursor.execute(f"TRUNCATE TABLE `{table}`")
print(f"Table '{table}' truncated successfully")
except pymysql.Error as err:
print(f"Error truncating table '{table}': {err}")
-
+
# Enable foreign key checks
cursor.execute("SET FOREIGN_KEY_CHECKS = 1")
+
if __name__ == "__main__":
base_dir = os.path.dirname(os.path.abspath(__file__))
- config_path = os.path.join(base_dir, 'mysql_config.json')
+ config_path = os.path.join(base_dir, "mysql_config.json")
with open(config_path) as f:
mysql_cred = json.load(f)
@@ -41,9 +45,9 @@ def truncate_all_tables(conn):
user=username,
password=password,
db=dbname, # Specify the database to use
- charset='utf8mb4',
+ charset="utf8mb4",
cursorclass=pymysql.cursors.DictCursor,
- autocommit=True
+ autocommit=True,
)
# Check connection
@@ -55,4 +59,4 @@ def truncate_all_tables(conn):
truncate_all_tables(conn)
# Close connection
- conn.close()
\ No newline at end of file
+ conn.close()
diff --git a/compute_hash.py b/compute_hash.py
index 0067cfdc..d63b22fb 100644
--- a/compute_hash.py
+++ b/compute_hash.py
@@ -4,6 +4,8 @@
import struct
import sys
from enum import Enum
+from datetime import datetime, date, timedelta
+from collections import defaultdict
class FileType(Enum):
NON_MAC = "non_mac"
@@ -16,6 +18,8 @@ class FileType(Enum):
script_version = "0.1"
+SPECIAL_SYMBOLS = '/":*|\\?%<>\x7f'
+
# CRC table
CRC16_XMODEM_TABLE = [
0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
@@ -73,9 +77,73 @@ def get_dirs_at_depth(directory, depth):
if depth == num_sep_this - num_sep:
yield root
-def read_be_32(byte_stream):
+
+def my_escape_string(s: str) -> str:
+ """
+ Escape strings
+
+ Escape the following:
+ - escape char: \x81
+ - unallowed filename chars: https://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words
+ - control chars < 0x20
+ """
+ new_name = ""
+ for char in s:
+ if char == "\x81":
+ new_name += "\x81\x79"
+ elif char in SPECIAL_SYMBOLS or ord(char) < 0x20:
+ new_name += "\x81" + chr(0x80 + ord(char))
+ else:
+ new_name += char
+ return new_name
+
+
+def encode_punycode(orig):
+ """
+ Punyencode strings
+
+ - escape special characters and
+    - ensure filenames can't end in a space or dot
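+
+    Illustrative example (the name ends with a dot, so it must be encoded):
+        encode_punycode("Read Me.") -> "xn--Read Me.-"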
+ """
+ s = my_escape_string(orig)
+ encoded = s.encode("punycode").decode("ascii")
+ # punyencoding adds an '-' at the end when there are no special chars
+ # don't use it for comparing
+ compare = encoded
+ if encoded.endswith("-"):
+ compare = encoded[:-1]
+ if orig != compare or compare[-1] in " .":
+ return "xn--" + encoded
+ return orig
+
+
+def punycode_need_encode(orig):
+ """
+ A filename needs to be punyencoded when it:
+
+ - contains a char that should be escaped or
+ - ends with a dot or a space.
+ """
+ if len(orig) > 4 and orig[:4] == "xn--":
+ return False
+ if not all((0x20 <= ord(c) < 0x80) and c not in SPECIAL_SYMBOLS for c in orig):
+ return True
+ if orig[-1] in " .":
+ return True
+ return False
+
+def encode_path_components(filepath):
+ """
+ Puny encodes all separate components of filepath
+ """
+    parts = [i for i in filepath.split(os.sep) if i]
+ encoded_parts = [encode_punycode(p) if punycode_need_encode(p) else p for p in parts]
+ return os.path.join(*encoded_parts)
+
+def read_be_32(byte_stream, signed=False):
""" Return unsigned integer of size_in_bits, assuming the data is big-endian """
- (uint,) = struct.unpack(">I", byte_stream[:32//8])
+    fmt = ">i" if signed else ">I"
+    (uint,) = struct.unpack(fmt, byte_stream[:32//8])
return uint
def read_be_16(byte_stream):
@@ -154,7 +222,6 @@ def is_actual_resource_fork_mac(filepath):
""" Returns boolean, checking the actual mac fork if it exists. """
resource_fork_path = os.path.join(filepath, "..namedfork", "rsrc")
- print(resource_fork_path)
return os.path.exists(resource_fork_path)
def is_appledouble(file_byte_stream):
@@ -178,7 +245,7 @@ def is_appledouble(file_byte_stream):
return True
def macbin_get_resfork_data(file_byte_stream):
- """ Returns the resource fork's data section as bytes of a macbinary file as well as its size """
+ """ Returns the resource fork's data section as bytes, data fork size (size), resource fork size (size-r) and data section of resource fork size (size-rd) of a macbinary file """
if not file_byte_stream:
return file_byte_stream
@@ -188,10 +255,10 @@ def macbin_get_resfork_data(file_byte_stream):
(rsrclen,) = struct.unpack(">I", file_byte_stream[0x57:0x5B])
resoure_fork_offset = 128 + datalen_padded
- data_offset = int.from_bytes(file_byte_stream[resoure_fork_offset+0 : resoure_fork_offset+4])
- data_length = int.from_bytes(file_byte_stream[resoure_fork_offset+8 : resoure_fork_offset+12])
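+    # Resource fork header: bytes 0-3 hold the offset of its data section, bytes 8-11 the section length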
+ rd_offset = int.from_bytes(file_byte_stream[resoure_fork_offset+0 : resoure_fork_offset+4])
+ rd_length = int.from_bytes(file_byte_stream[resoure_fork_offset+8 : resoure_fork_offset+12])
- return (file_byte_stream[resoure_fork_offset + data_offset: resoure_fork_offset + data_offset + data_length], data_length)
+ return (file_byte_stream[resoure_fork_offset + rd_offset: resoure_fork_offset + rd_offset + rd_length], datalen, rsrclen, rd_length)
def macbin_get_datafork(file_byte_stream):
if not file_byte_stream:
@@ -200,28 +267,9 @@ def macbin_get_datafork(file_byte_stream):
(datalen,) = struct.unpack(">I", file_byte_stream[0x53:0x57])
return file_byte_stream[0x80: 0x80 + datalen]
-def is_appledouble(file_byte_stream):
- """
- Appledouble Structure -
-
- Header:
- +$00 / 4: signature (0x00 0x05 0x16 0x00)
- +$04 / 4: version (0x00 0x01 0x00 0x00 (v1) -or- 0x00 0x02 0x00 0x00 (v2))
- +$08 /16: home file system string (v1) -or- zeroes (v2)
- +$18 / 2: number of entries
-
- Entries:
- +$00 / 4: entry ID (1-15)
- +$04 / 4: offset to data from start of file
- +$08 / 4: length of entry in bytes; may be zero
- """
- if (not file_byte_stream or read_be_32(file_byte_stream) != 0x00051607):
- return False
-
- return True
def appledouble_get_resfork_data(file_byte_stream):
- """ Returns the resource fork's data section as bytes of an appledouble file as well as its size """
+ """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of an appledouble file"""
entry_count = read_be_16(file_byte_stream[24:])
for entry in range(entry_count):
@@ -232,13 +280,13 @@ def appledouble_get_resfork_data(file_byte_stream):
if id == 2:
resource_fork_stream = file_byte_stream[offset:offset+length]
- data_offset = int.from_bytes(resource_fork_stream[0:4])
- data_length = int.from_bytes(resource_fork_stream[8:12])
+ rd_offset = int.from_bytes(resource_fork_stream[0:4])
+ rd_length = int.from_bytes(resource_fork_stream[8:12])
- return (resource_fork_stream[data_offset: data_offset+data_length], data_length)
+ return (resource_fork_stream[rd_offset: rd_offset+rd_length], length, rd_length)
def appledouble_get_datafork(filepath, fileinfo):
- """ Returns data fork's content as bytes of appledouble file if found, otherwise empty byte string """
+ """ Returns data fork's content as bytes and size of data fork of an appledouble file."""
try:
index = filepath.index("__MACOSX")
except ValueError:
@@ -252,50 +300,54 @@ def appledouble_get_datafork(filepath, fileinfo):
try:
with open(data_fork_path, "rb") as f:
- return f.read()
+ data = f.read()
+ return (data, len(data))
except (FileNotFoundError, IsADirectoryError):
-        return b''
+        return (b'', 0)
def raw_rsrc_get_datafork(filepath):
- """ Returns the data fork's content as bytes corresponding to raw rsrc file. """
+ """ Returns the data fork's content as bytes and size of the data fork corresponding to raw rsrc file. """
try:
with open(filepath[:-5]+".data", "rb") as f:
- return f.read()
+ data = f.read()
+ return (data, len(data))
except (FileNotFoundError, IsADirectoryError):
-        return b''
+        return (b'', 0)
def raw_rsrc_get_resource_fork_data(filepath):
- """ Returns the resource fork's data section as bytes of a raw rsrc file as well as its size """
+ """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of a raw rsrc file."""
with open(filepath, "rb") as f:
resource_fork_stream = f.read()
- data_offset = int.from_bytes(resource_fork_stream[0:4])
- data_length = int.from_bytes(resource_fork_stream[8:12])
+ resource_fork_len = len(resource_fork_stream)
+ rd_offset = int.from_bytes(resource_fork_stream[0:4])
+ rd_length = int.from_bytes(resource_fork_stream[8:12])
- return (resource_fork_stream[data_offset: data_offset+data_length], data_length)
+ return (resource_fork_stream[rd_offset: rd_offset+rd_length], resource_fork_len, rd_length)
def actual_mac_fork_get_data_fork(filepath):
- """ Returns the data fork's content as bytes if the actual mac fork exists """
+ """ Returns the data fork's content as bytes and its size if the actual mac fork exists """
try:
with open(filepath, "rb") as f:
- return f.read()
+ data = f.read()
+ return (data, len(data))
except (FileNotFoundError, IsADirectoryError):
-        return b''
+        return (b'', 0)
def actual_mac_fork_get_resource_fork_data(filepath):
- """ Returns the resource fork's data section as bytes of the actual mac fork as well as its size """
+ """ Returns the resource fork's data section as bytes, size of resource fork (size-r) and size of data section of resource fork (size-rd) of the actual mac fork."""
resource_fork_path = os.path.join(filepath, "..namedfork", "rsrc")
with open(resource_fork_path, "rb") as f:
resource_fork_stream = f.read()
- data_offset = int.from_bytes(resource_fork_stream[0:4])
- data_length = int.from_bytes(resource_fork_stream[8:12])
+ resource_fork_len = len(resource_fork_stream)
+ rd_offset = int.from_bytes(resource_fork_stream[0:4])
+ rd_length = int.from_bytes(resource_fork_stream[8:12])
- return (resource_fork_stream[data_offset: data_offset+data_length], data_length)
+ return (resource_fork_stream[rd_offset: rd_offset+rd_length], resource_fork_len, rd_length)
-def file_checksum(filepath, alg, size, file_info):
- cur_file_size = 0
+def file_checksum(filepath, alg, custom_checksum_size, file_info):
with open(filepath, "rb") as f:
if file_info[0] == FileType.NON_MAC:
- return (create_checksum_pairs(checksum(f, alg, size, filepath), alg, size), filesize(filepath))
+ return (create_checksum_pairs(checksum(f, alg, custom_checksum_size, filepath), alg, custom_checksum_size), filesize(filepath), 0, 0)
# Processing mac files
res = []
@@ -303,29 +355,33 @@ def file_checksum(filepath, alg, size, file_info):
datafork = b''
file_data = f.read()
+ size = 0
+ size_r = 0
+ size_rd = 0
+
if file_info[0] == FileType.MAC_BINARY:
- (resfork, cur_file_size) = macbin_get_resfork_data(file_data)
+ (resfork, size, size_r, size_rd) = macbin_get_resfork_data(file_data)
datafork = macbin_get_datafork(file_data)
elif file_info[0] in {FileType.APPLE_DOUBLE_DOT_, FileType.APPLE_DOUBLE_RSRC, FileType.APPLE_DOUBLE_MACOSX}:
- (resfork, cur_file_size) = appledouble_get_resfork_data(file_data)
- datafork = appledouble_get_datafork(filepath, file_info)
+ (resfork, size_r, size_rd) = appledouble_get_resfork_data(file_data)
+ (datafork, size) = appledouble_get_datafork(filepath, file_info)
elif file_info[0] == FileType.RAW_RSRC:
- (resfork, cur_file_size) = raw_rsrc_get_resource_fork_data(filepath)
- datafork = raw_rsrc_get_datafork(filepath)
+ (resfork, size_r, size_rd) = raw_rsrc_get_resource_fork_data(filepath)
+ datafork, size = raw_rsrc_get_datafork(filepath)
elif file_info[0] == FileType.ACTUAL_FORK_MAC:
- (resfork, cur_file_size) = actual_mac_fork_get_resource_fork_data(filepath)
- datafork = actual_mac_fork_get_data_fork(filepath)
+ (resfork, size_r, size_rd) = actual_mac_fork_get_resource_fork_data(filepath)
+ (datafork, size) = actual_mac_fork_get_data_fork(filepath)
- hashes = checksum(resfork, alg, size, filepath)
+ hashes = checksum(resfork, alg, custom_checksum_size, filepath)
prefix = 'r'
if len(resfork):
- res.extend(create_checksum_pairs(hashes, alg, size, prefix))
+ res.extend(create_checksum_pairs(hashes, alg, custom_checksum_size, prefix))
- hashes = checksum(datafork, alg, size, filepath)
+ hashes = checksum(datafork, alg, custom_checksum_size, filepath)
prefix = 'd'
- res.extend(create_checksum_pairs(hashes, alg, size, prefix))
+ res.extend(create_checksum_pairs(hashes, alg, custom_checksum_size, prefix))
- return (res, cur_file_size)
+ return (res, size, size_r, size_rd)
def create_checksum_pairs(hashes, alg, size, prefix=None):
res = []
@@ -505,7 +561,7 @@ def file_filter(files):
for file in to_be_deleted:
del files[file]
-def compute_hash_of_dirs(root_directory, depth, size=0, alg="md5"):
+def compute_hash_of_dirs(root_directory, depth, size=0, limit_timestamps_date=None, alg="md5"):
""" Return dictionary containing checksums of all files in directory """
res = []
@@ -518,8 +574,11 @@ def compute_hash_of_dirs(root_directory, depth, size=0, alg="md5"):
for root, _, contents in os.walk(directory):
files.extend([os.path.join(root, f) for f in contents])
+        # Filter files by the user-provided cutoff date - limit_timestamps_date
+        filtered_file_map = filter_files_by_timestamp(files, limit_timestamps_date)
+
# Produce filetype and filename(name to be used in game entry) for each file
- for filepath in files:
+ for filepath in filtered_file_map:
file_collection[filepath] = file_classification(filepath)
# Remove extra entries of macfiles to avoid extra checksum calculation in form of non mac files
@@ -538,11 +597,113 @@ def compute_hash_of_dirs(root_directory, depth, size=0, alg="md5"):
relative_dir = os.path.dirname(os.path.dirname(relative_path))
relative_path = os.path.join(relative_dir, base_name)
- hash_of_dir[relative_path] = file_checksum(file_path, alg, size, file_info)
+ hash_of_dir[relative_path] = file_checksum(file_path, alg, size, file_info) + (filtered_file_map[file_path],)
res.append(hash_of_dir)
return res
+
+def extract_macbin_mtime(file_byte_stream):
+ """
+ Returns modification time of macbinary file from the header.
+ Doc - +$5f / 4: modification date/time.
+ Doc - Timestamps are unsigned 32-bit values indicating the time in seconds since midnight on Jan 1, 1904, in local time.
+ """
+ macbin_epoch = datetime(1904, 1, 1)
+ header = file_byte_stream[:128]
+ macbin_seconds = read_be_32(header[0x5f:])
+ return (macbin_epoch + timedelta(seconds=macbin_seconds)).date()
+
+
+def extract_mtime_appledouble(file_byte_stream):
+ """
+ Returns modification time of appledouble file.
+ Doc 1 - The File Dates Info entry (ID=8) consists of the file creation, modification, backup
+ and access times (see Figure 2-1), stored as a signed number of seconds before
+ or after 12:00 a.m. (midnight), January 1, 2000 Greenwich Mean Time (GMT)
+
+ Doc 2 -
+ struct ASFileDates /* entry ID 8, file dates info */
+ {
+ sint32 create; /* file creation date/time */
+ sint32 modify; /* last modification date/time */
+ sint32 backup; /* last backup date/time */
+ sint32 access; /* last access date/time */
+ }; /* ASFileDates */
+ """
+ entry_count = read_be_16(file_byte_stream[24:])
+ for entry in range(entry_count):
+ start_index = 26 + entry*12
+ id = read_be_32(file_byte_stream[start_index:])
+ offset = read_be_32(file_byte_stream[start_index+4:])
+ length = read_be_32(file_byte_stream[start_index+8:])
+
+ if id == 8:
+ date_info_data = file_byte_stream[offset:offset + length]
+ if len(date_info_data) < 16:
+ raise ValueError("FileDatesInfo block is too short.")
+ appledouble_epoch = datetime(2000, 1, 1)
+ modify_seconds = read_be_32(date_info_data[4:8], signed=True)
+ return (appledouble_epoch + timedelta(seconds=modify_seconds)).date()
+
+ return None
+
+
+def macfile_timestamp(filepath):
+ """
+ Returns the modification times for the mac file from their finderinfo.
+ If the file is not a macfile, it returns None
+ """
+ with open(filepath, "rb") as f:
+ data = f.read()
+ # Macbinary
+ if is_macbin(filepath):
+ return extract_macbin_mtime(data)
+
+ # Appledouble
+ if is_appledouble_rsrc(filepath) or is_appledouble_in_dot_(filepath) or is_appledouble_in_macosx(filepath):
+ return extract_mtime_appledouble(data)
+
+ return None
+
+
+def validate_date(date_str):
+ """
+ Confirms if the user provided timestamp is in a valid format.
+ Returns the date as a datetime object.
+ """
+ formats = ["%Y-%m-%d", "%Y-%m", "%Y"]
+ for fmt in formats:
+ try:
+ return datetime.strptime(date_str, fmt).date()
+ except ValueError:
+ continue
+ raise ValueError("Invalid date format. Use YYYY, YYYY-MM, or YYYY-MM-DD")
+
+
+def filter_files_by_timestamp(files, limit_timestamps_date):
+ """
+    Removes files that were modified after the timestamp provided by the user.
+    Files that were modified today are kept.
+    Returns a filtered map of filepath to its modification time.
+ """
+
+ filtered_file_map = defaultdict(str)
+
+    if limit_timestamps_date is not None:
+ user_date = validate_date(limit_timestamps_date)
+ today = date.today()
+
+ for filepath in files:
+ mtime = macfile_timestamp(filepath)
+ if mtime is None:
+ mtime = datetime.fromtimestamp(os.path.getmtime(filepath)).date()
+        if limit_timestamps_date is None or mtime <= user_date or mtime == today:
+ filtered_file_map[filepath] = str(mtime)
+
+ return filtered_file_map
+
+
def create_dat_file(hash_of_dirs, path, checksum_size=0):
with open(f"{os.path.basename(path)}.dat", "w") as file:
# Header
@@ -556,8 +717,9 @@ def create_dat_file(hash_of_dirs, path, checksum_size=0):
# Game files
for hash_of_dir in hash_of_dirs:
file.write("game (\n")
- for filename, (hashes, filesize) in hash_of_dir.items():
- data = f"name \"{filename}\" size {filesize}"
+ for filename, (hashes, size, size_r, size_rd, timestamp) in hash_of_dir.items():
+ filename = encode_path_components(filename)
+ data = f"name '{filename}' size {size} size-r {size_r} size-rd {size_rd} modification-time {timestamp}"
for key, value in hashes:
data += f" {key} {value}"
@@ -579,10 +741,13 @@ def error(self, message):
help="Depth from root to game directories")
parser.add_argument("--size",
help="Use first n bytes of file to calculate checksum")
+parser.add_argument("--limit-timestamps",
+                    help="Format - YYYY-MM-DD, YYYY-MM or YYYY. Filters out files that were modified after the given timestamp. Files modified today are never filtered out.")
args = parser.parse_args()
path = os.path.abspath(args.directory) if args.directory else os.getcwd()
depth = int(args.depth) if args.depth else 0
checksum_size = int(args.size) if args.size else 0
+limit_timestamp_date = str(args.limit_timestamps) if args.limit_timestamps else None
create_dat_file(compute_hash_of_dirs(
- path, depth, checksum_size), path, checksum_size)
+ path, depth, checksum_size, limit_timestamp_date), path, checksum_size)
diff --git a/dat_parser.py b/dat_parser.py
index b3ce12ef..a76480b2 100644
--- a/dat_parser.py
+++ b/dat_parser.py
@@ -33,6 +33,9 @@ def map_checksum_data(content_string):
elif tokens[i] == "size-rd":
current_rom["size-rd"] = int(tokens[i + 1])
i += 2
+ elif tokens[i] == "modification-time":
+ current_rom["modification-time"] = tokens[i + 1]
+ i += 2
else:
checksum_key = tokens[i]
checksum_value = tokens[i + 1] if len(tokens) >= 6 else "0"
diff --git a/db_functions.py b/db_functions.py
index 45adc5de..490bcebb 100644
--- a/db_functions.py
+++ b/db_functions.py
@@ -9,11 +9,13 @@
from collections import defaultdict
import re
import copy
+import sys
SPECIAL_SYMBOLS = '/":*|\\?%<>\x7f'
def db_connect():
+ console_log("Connecting to the Database.")
base_dir = os.path.dirname(os.path.abspath(__file__))
config_path = os.path.join(base_dir, "mysql_config.json")
with open(config_path) as f:
@@ -28,10 +30,28 @@ def db_connect():
cursorclass=pymysql.cursors.DictCursor,
autocommit=False,
)
-
+ console_log(f"Connected to Database - {mysql_cred['dbname']}")
return conn
+def db_connect_root():
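+    """
+    Connect to the MySQL server without selecting a database and return
+    the connection together with the configured database name.
+    """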
+ base_dir = os.path.dirname(os.path.abspath(__file__))
+ config_path = os.path.join(base_dir, "mysql_config.json")
+ with open(config_path) as f:
+ mysql_cred = json.load(f)
+
+ conn = pymysql.connect(
+ host=mysql_cred["servername"],
+ user=mysql_cred["username"],
+ password=mysql_cred["password"],
+ charset="utf8mb4",
+ cursorclass=pymysql.cursors.DictCursor,
+ autocommit=True,
+ )
+
+ return (conn, mysql_cred["dbname"])
+
+
def get_checksum_props(checkcode, checksum):
checksize = 0
checktype = checkcode
@@ -59,24 +79,26 @@ def insert_game(engine_name, engineid, title, gameid, extra, platform, lang, con
# Set @engine_last if engine already present in table
exists = False
with conn.cursor() as cursor:
- cursor.execute(f"SELECT id FROM engine WHERE engineid = '{engineid}'")
+ cursor.execute("SELECT id FROM engine WHERE engineid = %s", (engineid,))
res = cursor.fetchone()
if res is not None:
exists = True
- cursor.execute(f"SET @engine_last = '{res['id']}'")
+ cursor.execute("SET @engine_last = %s", (res["id"],))
# Insert into table if not present
if not exists:
with conn.cursor() as cursor:
cursor.execute(
- f"INSERT INTO engine (name, engineid) VALUES ('{escape_string(engine_name)}', '{engineid}')"
+ "INSERT INTO engine (name, engineid) VALUES (%s, %s)",
+ (engine_name, engineid),
)
cursor.execute("SET @engine_last = LAST_INSERT_ID()")
# Insert into game
with conn.cursor() as cursor:
cursor.execute(
- f"INSERT INTO game (name, engine, gameid, extra, platform, language) VALUES ('{escape_string(title)}', @engine_last, '{gameid}', '{escape_string(extra)}', '{platform}', '{lang}')"
+ "INSERT INTO game (name, engine, gameid, extra, platform, language) VALUES (%s, @engine_last, %s, %s, %s, %s)",
+ (title, gameid, extra, platform, lang),
)
cursor.execute("SET @game_last = LAST_INSERT_ID()")
@@ -89,6 +111,7 @@ def insert_fileset(
transaction,
log_text,
conn,
+ set_dat_metadata="",
ip="",
username=None,
skiplog=None,
@@ -108,27 +131,27 @@ def insert_fileset(
-    # Check if key/megakey already exists, if so, skip insertion (no quotes on purpose)
+    # Check if key/megakey already exists, if so, skip insertion
if detection:
with conn.cursor() as cursor:
- cursor.execute(f"SELECT id FROM fileset WHERE megakey = {megakey}")
+ cursor.execute("SELECT id FROM fileset WHERE megakey = %s", (megakey,))
existing_entry = cursor.fetchone()
else:
with conn.cursor() as cursor:
- cursor.execute(f"SELECT id FROM fileset WHERE `key` = {key}")
+ cursor.execute("SELECT id FROM fileset WHERE `key` = %s", (key,))
existing_entry = cursor.fetchone()
if existing_entry is not None:
existing_entry = existing_entry["id"]
with conn.cursor() as cursor:
- cursor.execute(f"SET @fileset_last = {existing_entry}")
- cursor.execute(f"DELETE FROM file WHERE fileset = {existing_entry}")
+ cursor.execute("SET @fileset_last = %s", (existing_entry,))
+ cursor.execute("DELETE FROM file WHERE fileset = %s", (existing_entry,))
cursor.execute(
- f"UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = {existing_entry}"
+ "UPDATE fileset SET `timestamp` = FROM_UNIXTIME(@fileset_time_last) WHERE id = %s",
+ (existing_entry,),
)
cursor.execute(
- f"UPDATE fileset SET status = 'detection' WHERE id = {existing_entry} AND status = 'obsolete'"
+ "SELECT status FROM fileset WHERE id = %s", (existing_entry,)
)
- cursor.execute(f"SELECT status FROM fileset WHERE id = {existing_entry}")
status = cursor.fetchone()["status"]
if status == "user":
add_usercount(existing_entry, conn)
@@ -144,10 +167,10 @@ def insert_fileset(
return (existing_entry, True)
-    # $game and $key should not be parsed as a mysql string, hence no quotes
- query = f"INSERT INTO fileset (game, status, src, `key`, megakey, `timestamp`) VALUES ({game}, '{status}', '{src}', {key}, {megakey}, FROM_UNIXTIME(@fileset_time_last))"
+ query = "INSERT INTO fileset (game, status, src, `key`, megakey, `timestamp`, set_dat_metadata) VALUES (%s, %s, %s, %s, %s, FROM_UNIXTIME(@fileset_time_last), %s)"
fileset_id = -1
with conn.cursor() as cursor:
- cursor.execute(query)
+ cursor.execute(query, (game, status, src, key, megakey, set_dat_metadata))
fileset_id = cursor.lastrowid
cursor.execute("SET @fileset_last = LAST_INSERT_ID()")
@@ -170,7 +193,8 @@ def insert_fileset(
update_history(0, fileset_last, conn)
with conn.cursor() as cursor:
cursor.execute(
- f"INSERT INTO transactions (`transaction`, fileset) VALUES ({transaction}, {fileset_last})"
+ "INSERT INTO transactions (`transaction`, fileset) VALUES (%s, %s)",
+ (transaction, fileset_last),
)
return (fileset_id, False)
@@ -212,14 +236,11 @@ def insert_file(file, detection, src, conn):
values.append(file["size"] if "size" in file else "0")
values.append(file["size-r"] if "size-r" in file else "0")
values.append(file["size-rd"] if "size-rd" in file else "0")
-
+ values.append(file["modification-time"] if "modification-time" in file else "")
values.extend([checksum, detection, detection_type])
# Parameterised Query
- placeholders = (
- ["%s"] * (len(values[:5])) + ["@fileset_last"] + ["%s"] * 2 + ["NOW()"]
- )
- query = f"INSERT INTO file ( name, size, `size-r`, `size-rd`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ({', '.join(placeholders)})"
+ query = "INSERT INTO file ( name, size, `size-r`, `size-rd`, `modification-time`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES (%s, %s, %s, %s, %s, %s, @fileset_last, %s, %s, NOW())"
with conn.cursor() as cursor:
cursor.execute(query, values)
@@ -227,32 +248,39 @@ def insert_file(file, detection, src, conn):
if detection:
with conn.cursor() as cursor:
cursor.execute(
- f"UPDATE fileset SET detection_size = {checksize} WHERE id = @fileset_last AND detection_size IS NULL"
+ "UPDATE fileset SET detection_size = %s WHERE id = @fileset_last AND detection_size IS NULL",
+ (checksize,),
)
with conn.cursor() as cursor:
cursor.execute("SET @file_last = LAST_INSERT_ID()")
-def insert_filechecksum(file, checktype, conn):
+def insert_filechecksum(file, checktype, file_id, conn):
if checktype not in file:
return
checksum = file[checktype]
checksize, checktype, checksum = get_checksum_props(checktype, checksum)
- query = f"INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (@file_last, '{checksize}', '{checktype}', '{checksum}')"
+ query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)"
+ with conn.cursor() as cursor:
+ cursor.execute(query, (file_id, checksize, checktype, checksum))
+
+ add_all_equal_checksums(checksize, checktype, checksum, file_id, conn)
+
+
+def add_all_equal_checksums(checksize, checktype, checksum, file_id, conn):
+ """
+ We can update all the checksums when file size is less than the checksum size type, as all checksums are equal in that case.
+ """
with conn.cursor() as cursor:
- cursor.execute(query)
if "md5" not in checktype:
return
-
size_name = "size"
if checktype[-1] == "r":
size_name += "-rd"
- if checktype[-1] == "s":
- size_name += "-d"
- cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = @file_last")
+ cursor.execute(f"SELECT `{size_name}` FROM file WHERE id = %s", (file_id,))
result = cursor.fetchone()
if not result:
return
@@ -281,9 +309,10 @@ def insert_filechecksum(file, checktype, conn):
checksum_size = exploded.pop()
checksum_type = "-".join(exploded)
- query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (@file_last, %s, %s, %s)"
- with conn.cursor() as cursor:
- cursor.execute(query, (checksum_size, checksum_type, checksum))
+ query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)"
+ cursor.execute(
+ query, (file_id, checksum_size, checksum_type, checksum)
+ )
def delete_filesets(conn):
@@ -347,9 +376,10 @@ def punycode_need_encode(orig):
def create_log(category, user, text, conn):
query = f"INSERT INTO log (`timestamp`, category, user, `text`) VALUES (FROM_UNIXTIME({int(time.time())}), '{escape_string(category)}', '{escape_string(user)}', '{escape_string(text)}')"
+ query = "INSERT INTO log (`timestamp`, category, user, `text`) VALUES (FROM_UNIXTIME(%s), %s, %s, %s)"
with conn.cursor() as cursor:
try:
- cursor.execute(query)
+ cursor.execute(query, (int(time.time()), category, user, text))
conn.commit()
except Exception as e:
conn.rollback()
@@ -362,10 +392,12 @@ def create_log(category, user, text, conn):
def update_history(source_id, target_id, conn, log_last=None):
- query = f"INSERT INTO history (`timestamp`, fileset, oldfileset, log) VALUES (NOW(), {target_id}, {source_id}, {log_last if log_last is not None else 0})"
+ query = "INSERT INTO history (`timestamp`, fileset, oldfileset, log) VALUES (NOW(), %s, %s, %s)"
with conn.cursor() as cursor:
try:
- cursor.execute(query)
+ cursor.execute(
+ query, (target_id, source_id, log_last if log_last is not None else 0)
+ )
conn.commit()
except Exception as e:
conn.rollback()
@@ -390,7 +422,8 @@ def get_all_related_filesets(fileset_id, conn, visited=None):
try:
with conn.cursor() as cursor:
cursor.execute(
- f"SELECT fileset, oldfileset FROM history WHERE fileset = {fileset_id} OR oldfileset = {fileset_id}"
+ "SELECT fileset, oldfileset FROM history WHERE fileset = %s OR oldfileset = %s",
+ (fileset_id, fileset_id),
)
history_records = cursor.fetchall()
@@ -488,7 +521,7 @@ def db_insert(data_arr, username=None, skiplog=False):
detection = src == "scummvm"
status = "detection" if detection else src
- conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}")
+ conn.cursor().execute("SET @fileset_time_last = %s", (int(time.time()),))
with conn.cursor() as cursor:
cursor.execute("SELECT MAX(`transaction`) FROM transactions")
@@ -498,23 +531,34 @@ def db_insert(data_arr, username=None, skiplog=False):
transaction_id = temp + 1
category_text = f"Uploaded from {src}"
- log_text = f"Started loading DAT file, size {os.path.getsize(filepath)}, author {author}, version {version}. State {status}. Transaction: {transaction_id}"
+ log_text = f"Started loading DAT file {filepath}, size {os.path.getsize(filepath)}, author {author}, version {version}. State {status}. Transaction: {transaction_id}"
user = f"cli:{getpass.getuser()}" if username is None else username
create_log(escape_string(category_text), user, escape_string(log_text), conn)
+ console_log(log_text)
+ console_log_total_filesets(filepath)
+
+ fileset_count = 1
for fileset in game_data:
+ console_log_detection(fileset_count)
key = calc_key(fileset)
megakey = calc_megakey(fileset)
if detection:
- engine_name = fileset["engine"]
- engineid = fileset["sourcefile"]
- gameid = fileset["name"]
- title = fileset["title"]
- extra = fileset["extra"]
- platform = fileset["platform"]
- lang = fileset["language"]
+ try:
+ engine_name = fileset.get("engine", "")
+ engineid = fileset["sourcefile"]
+ gameid = fileset["name"]
+ title = fileset.get("title", "")
+ extra = fileset.get("extra", "")
+ platform = fileset.get("platform", "")
+ lang = fileset.get("language", "")
+ except KeyError as e:
+ print(
+ f"Missing key in header: {e} for {fileset.get('name', '')}-{fileset.get('language', '')}-{fileset.get('platform', '')}"
+ )
+ return
with conn.cursor() as cursor:
query = """
@@ -527,7 +571,7 @@ def db_insert(data_arr, username=None, skiplog=False):
if existing_entry is not None:
log_text = f"Skipping Entry as similar entry already exsits - Fileset:{existing_entry['id']}. Skpped entry details - engineid = {engineid}, gameid = {gameid}, platform = {platform}, language = {lang}"
create_log("Warning", user, escape_string(log_text), conn)
- print(log_text)
+ console_log(log_text)
continue
insert_game(
@@ -558,23 +602,27 @@ def db_insert(data_arr, username=None, skiplog=False):
for file in unique_files:
insert_file(file, detection, src, conn)
+ file_id = None
+ with conn.cursor() as cursor:
+ cursor.execute("SELECT @file_last AS file_id")
+ file_id = cursor.fetchone()["file_id"]
for key, value in file.items():
if key not in ["name", "size", "size-r", "size-rd", "sha1", "crc"]:
- insert_filechecksum(file, key, conn)
+ insert_filechecksum(file, key, file_id, conn)
+
+ fileset_count += 1
- if detection:
- conn.cursor().execute(
- "UPDATE fileset SET status = 'obsolete' WHERE `timestamp` != FROM_UNIXTIME(@fileset_time_last) AND status = 'detection'"
- )
cur = conn.cursor()
try:
cur.execute(
- f"SELECT COUNT(fileset) from transactions WHERE `transaction` = {transaction_id}"
+ "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s",
+ (transaction_id,),
)
fileset_insertion_count = cur.fetchone()["COUNT(fileset)"]
category_text = f"Uploaded from {src}"
log_text = f"Completed loading DAT file, filename {filepath}, size {os.path.getsize(filepath)}, author {author}, version {version}. State {status}. Number of filesets: {fileset_insertion_count}. Transaction: {transaction_id}"
+ console_log(log_text)
except Exception as e:
print("Inserting failed:", e)
else:
@@ -585,11 +633,13 @@ def db_insert(data_arr, username=None, skiplog=False):
def compare_filesets(id1, id2, conn):
with conn.cursor() as cursor:
cursor.execute(
- f"SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = '{id1}'"
+ "SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = %s",
+ (id1,),
)
fileset1 = cursor.fetchall()
cursor.execute(
- f"SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = '{id2}'"
+ "SELECT name, size, `size-r`, `size-rd`, checksum FROM file WHERE fileset = %s",
+ (id2,),
)
fileset2 = cursor.fetchall()
@@ -623,9 +673,9 @@ def find_matching_game(game_files):
for file in game_files:
checksum = file[1]
- query = f"SELECT file.fileset as file_fileset FROM filechecksum JOIN file ON filechecksum.file = file.id WHERE filechecksum.checksum = '{checksum}' AND file.detection = TRUE"
+ query = "SELECT file.fileset as file_fileset FROM filechecksum JOIN file ON filechecksum.file = file.id WHERE filechecksum.checksum = %s AND file.detection = TRUE"
with conn.cursor() as cursor:
- cursor.execute(query)
+ cursor.execute(query, (checksum,))
records = cursor.fetchall()
# If file is not part of detection entries, skip it
@@ -640,7 +690,8 @@ def find_matching_game(game_files):
for key, value in Counter(matching_filesets).items():
with conn.cursor() as cursor:
cursor.execute(
- f"SELECT COUNT(file.id) FROM file JOIN fileset ON file.fileset = fileset.id WHERE fileset.id = '{key}'"
+ "SELECT COUNT(file.id) FROM file JOIN fileset ON file.fileset = fileset.id WHERE fileset.id = %s",
+ (key,),
)
count_files_in_fileset = cursor.fetchone()["COUNT(file.id)"]
@@ -651,7 +702,8 @@ def find_matching_game(game_files):
with conn.cursor() as cursor:
cursor.execute(
- f"SELECT engineid, game.id, gameid, platform, language, `key`, src, fileset.id as fileset FROM game JOIN fileset ON fileset.game = game.id JOIN engine ON engine.id = game.engine WHERE fileset.id = '{key}'"
+ "SELECT engineid, game.id, gameid, platform, language, `key`, src, fileset.id as fileset FROM game JOIN fileset ON fileset.game = game.id JOIN engine ON engine.id = game.engine WHERE fileset.id = %s",
+ (key,),
)
records = cursor.fetchall()
@@ -675,7 +727,7 @@ def find_matching_game(game_files):
if compare_filesets(matching_games[0]["fileset"], game_files[0][0], conn):
with conn.cursor() as cursor:
cursor.execute(
- f"UPDATE fileset SET `delete` = TRUE WHERE id = {game_files[0][0]}"
+ "UPDATE fileset SET `delete` = TRUE WHERE id = %s", (game_files[0][0],)
)
return []
@@ -688,7 +740,8 @@ def merge_filesets(detection_id, dat_id):
try:
with conn.cursor() as cursor:
cursor.execute(
- f"SELECT DISTINCT(filechecksum.checksum), checksize, checktype FROM filechecksum JOIN file on file.id = filechecksum.file WHERE fileset = '{detection_id}'"
+                "SELECT DISTINCT(filechecksum.checksum), checksize, checktype FROM filechecksum JOIN file on file.id = filechecksum.file WHERE fileset = %s",
+ (detection_id,),
)
detection_files = cursor.fetchall()
@@ -698,22 +751,26 @@ def merge_filesets(detection_id, dat_id):
checktype = file[2]
cursor.execute(
- f"DELETE FROM file WHERE checksum = '{checksum}' AND fileset = {detection_id} LIMIT 1"
+ "DELETE FROM file WHERE checksum = %s AND fileset = %s LIMIT 1",
+ (checksum, detection_id),
)
cursor.execute(
- f"UPDATE file JOIN filechecksum ON filechecksum.file = file.id SET detection = TRUE, checksize = {checksize}, checktype = '{checktype}' WHERE fileset = '{dat_id}' AND filechecksum.checksum = '{checksum}'"
+ "UPDATE file JOIN filechecksum ON filechecksum.file = file.id SET detection = TRUE, checksize = %s, checktype = %s WHERE fileset = %s AND filechecksum.checksum = %s",
+ (checksize, checktype, dat_id, checksum),
)
cursor.execute(
- f"INSERT INTO history (`timestamp`, fileset, oldfileset) VALUES (FROM_UNIXTIME({int(time.time())}), {dat_id}, {detection_id})"
+ "INSERT INTO history (`timestamp`, fileset, oldfileset) VALUES (FROM_UNIXTIME(%s), %s, %s)",
+ (int(time.time()), dat_id, detection_id),
)
cursor.execute("SELECT LAST_INSERT_ID()")
history_last = cursor.fetchone()["LAST_INSERT_ID()"]
cursor.execute(
- f"UPDATE history SET fileset = {dat_id} WHERE fileset = {detection_id}"
+ "UPDATE history SET fileset = %s WHERE fileset = %s",
+ (dat_id, detection_id),
)
- cursor.execute(f"DELETE FROM fileset WHERE id = {detection_id}")
+ cursor.execute("DELETE FROM fileset WHERE id = %s", (detection_id,))
conn.commit()
except Exception as e:
@@ -770,11 +827,13 @@ def populate_matching_games():
log_text = f"Matched game {matched_game['engineid']}:\n{matched_game['gameid']}-{matched_game['platform']}-{matched_game['language']}\nvariant {matched_game['key']}. State {status}. Fileset:{fileset[0][0]}."
# Updating the fileset.game value to be $matched_game["id"]
- query = f"UPDATE fileset SET game = {matched_game['id']}, status = '{status}', `key` = '{matched_game['key']}' WHERE id = {fileset[0][0]}"
+ query = "UPDATE fileset SET game = %s, status = %s, `key` = %s WHERE id = %s"
history_last = merge_filesets(matched_game["fileset"], fileset[0][0])
- if cursor.execute(query):
+ if cursor.execute(
+ query, (matched_game["id"], status, matched_game["key"], fileset[0][0])
+ ):
user = f"cli:{getpass.getuser()}"
create_log(
@@ -793,7 +852,7 @@ def populate_matching_games():
# Add log id to the history table
cursor.execute(
- f"UPDATE history SET log = {log_last} WHERE id = {history_last}"
+ "UPDATE history SET log = %s WHERE id = %s", (log_last, history_last)
)
try:
@@ -831,7 +890,7 @@ def match_fileset(data_arr, username=None, skiplog=False):
detection = src == "scummvm"
source_status = "detection" if detection else src
- conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}")
+ conn.cursor().execute("SET @fileset_time_last = %s", (int(time.time()),))
with conn.cursor() as cursor:
cursor.execute("SELECT MAX(`transaction`) FROM transactions")
@@ -839,8 +898,9 @@ def match_fileset(data_arr, username=None, skiplog=False):
transaction_id = transaction_id + 1 if transaction_id else 1
category_text = f"Uploaded from {src}"
- log_text = f"Started loading DAT file, size {os.path.getsize(filepath)}, author {author}, version {version}. State {source_status}. Transaction: {transaction_id}"
-
+ log_text = f"Started loading DAT file {filepath}, size {os.path.getsize(filepath)}, author {author}, version {version}. State {source_status}. Transaction: {transaction_id}"
+ console_log(log_text)
+ console_log_total_filesets(filepath)
user = f"cli:{getpass.getuser()}" if username is None else username
create_log(escape_string(category_text), user, escape_string(log_text), conn)
@@ -859,6 +919,21 @@ def match_fileset(data_arr, username=None, skiplog=False):
user,
skiplog,
)
+ elif src == "scan":
+ scan_process(
+ game_data,
+ resources,
+ detection,
+ src,
+ conn,
+ transaction_id,
+ filepath,
+ author,
+ version,
+ source_status,
+ user,
+ skiplog,
+ )
else:
game_data_lookup = {fs["name"]: fs for fs in game_data}
for fileset in game_data:
@@ -881,6 +956,720 @@ def match_fileset(data_arr, username=None, skiplog=False):
)
+def scan_process(
+ game_data,
+ resources,
+ detection,
+ src,
+ conn,
+ transaction_id,
+ filepath,
+ author,
+ version,
+ source_status,
+ user,
+ skiplog,
+):
+ """
+ Entry point for processing logic for scan.dat.
+ First Pass - Update all files with matching checksum and file size.
+    Second Pass - Filter candidates by matching filename, filesize and file checksums
+ - Perform matching.
+ """
+
+ manual_merged_filesets = 0
+ automatic_merged_filesets = 0
+ match_with_full_fileset = 0
+ mismatch_with_full_fileset = 0
+ dropped_early_no_candidate = 0
+ manual_merged_with_detection = 0
+ filesets_with_missing_files = 0
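+    # These counters feed the final "Upload information" log entry written at the end of this function.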
+
+ id_to_fileset_mapping = defaultdict(dict)
+
+ fileset_count = 0
+ for fileset in game_data:
+ console_log_file_update(fileset_count)
+ key = calc_key(fileset)
+ megakey = ""
+ log_text = f"State {source_status}."
+
+ (fileset_id, existing) = insert_new_fileset(
+ fileset,
+ conn,
+ detection,
+ src,
+ key,
+ megakey,
+ transaction_id,
+ log_text,
+ user,
+ skiplog=skiplog,
+ )
+ if existing:
+ continue
+
+ id_to_fileset_mapping[fileset_id] = fileset
+
+ # set of filesets whose files got updated
+ filesets_check_for_full = set()
+
+ for rom in fileset["rom"]:
+ pre_update_files(rom, filesets_check_for_full, transaction_id, conn)
+ fileset_count += 1
+
+ fileset_count = 0
+ for fileset_id, fileset in id_to_fileset_mapping.items():
+ console_log_matching(fileset_count)
+ candidate_filesets = scan_filter_candidate_filesets(
+ fileset_id, fileset, transaction_id, conn
+ )
+
+ if len(candidate_filesets) == 0:
+ category_text = "Drop fileset - No Candidates"
+ fileset_name = fileset["name"] if "name" in fileset else ""
+ fileset_description = (
+ fileset["description"] if "description" in fileset else ""
+ )
+ log_text = f"Drop fileset as no matching candidates. Name: {fileset_name}, Description: {fileset_description}."
+ create_log(
+ escape_string(category_text), user, escape_string(log_text), conn
+ )
+ dropped_early_no_candidate += 1
+ delete_original_fileset(fileset_id, conn)
+ continue
+
+ (
+ automatic_merged_filesets,
+ manual_merged_filesets,
+ match_with_full_fileset,
+ mismatch_with_full_fileset,
+ manual_merged_with_detection,
+ filesets_with_missing_files,
+ ) = scan_perform_match(
+ fileset,
+ src,
+ user,
+ fileset_id,
+ detection,
+ candidate_filesets,
+ automatic_merged_filesets,
+ manual_merged_filesets,
+ match_with_full_fileset,
+ mismatch_with_full_fileset,
+ manual_merged_with_detection,
+ filesets_with_missing_files,
+ conn,
+ skiplog,
+ )
+ fileset_count += 1
+
+ # Final log
+ with conn.cursor() as cursor:
+ cursor.execute(
+ "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s",
+ (transaction_id,),
+ )
+ fileset_insertion_count = cursor.fetchone()["COUNT(fileset)"]
+ category_text = f"Uploaded from {src}"
+ log_text = f"Completed loading DAT file, filename {filepath}, size {os.path.getsize(filepath)}. State {source_status}. Number of filesets: {fileset_insertion_count}. Transaction: {transaction_id}"
+ create_log(escape_string(category_text), user, escape_string(log_text), conn)
+ category_text = "Upload information"
+ log_text = f"Number of filesets: {fileset_insertion_count}. Filesets automatically merged: {automatic_merged_filesets}. Filesets requiring manual merge (multiple candidates): {manual_merged_filesets}. Filesets requiring manual merge (matched with detection): {manual_merged_with_detection}. Filesets dropped, no candidate: {dropped_early_no_candidate}. Filesets matched with existing Full fileset: {match_with_full_fileset}. Filesets with mismatched files with Full fileset: {mismatch_with_full_fileset}. Filesets missing files compared to partial fileset candidate: {filesets_with_missing_files}."
+ console_log(log_text)
+ create_log(escape_string(category_text), user, escape_string(log_text), conn)
+
+
+def pre_update_files(rom, filesets_check_for_full, transaction_id, conn):
+ """
+ Updates all the checksums for the files matching by a checksum and size.
+ """
+ with conn.cursor() as cursor:
+ checksums = defaultdict(str)
+ for key in rom:
+ if key not in ["name", "size", "size-r", "size-rd", "modification-time"]:
+ checksums[key] = rom[key]
+
+ files_to_update = set()
+ size = rom["size"] if "size" in rom else 0
+ size_r = rom["size-r"] if "size-r" in rom else 0
+ size_rd = rom["size-rd"] if "size-rd" in rom else 0
+
+ for _, checksum in checksums.items():
+ query = """
+ SELECT f.id as file_id, fs.id as fileset_id
+ FROM file f
+ JOIN filechecksum fc ON fc.file = f.id
+ JOIN fileset fs ON fs.id = f.fileset
+ JOIN transactions t ON t.fileset = fs.id
+ WHERE fc.checksum = %s
+ AND f.size = %s
+ AND f.`size-r` = %s
+ AND f.`size-rd` = %s
+ AND t.transaction != %s
+ """
+
+ cursor.execute(query, (checksum, size, size_r, size_rd, transaction_id))
+ result = cursor.fetchall()
+ if result:
+ for file in result:
+ filesets_check_for_full.add(file["fileset_id"])
+ files_to_update.add(file["file_id"])
+
+ for file_id in files_to_update:
+ query = """
+ DELETE FROM filechecksum
+ WHERE file = %s
+ """
+ cursor.execute(query, (file_id,))
+ # Update checksums
+ for check, checksum in checksums.items():
+ checksize, checktype, checksum = get_checksum_props(check, checksum)
+ query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)"
+ cursor.execute(query, (file_id, checksize, checktype, checksum))
+ # Update sizes
+ query = """
+ UPDATE file
+ SET size = %s,
+ `size-r` = %s,
+ `size-rd` = %s,
+ name = %s
+ WHERE id = %s
+ """
+ cursor.execute(
+ query, (size, size_r, size_rd, normalised_path(rom["name"]), file_id)
+ )
+
+
+def scan_perform_match(
+ fileset,
+ src,
+ user,
+ fileset_id,
+ detection,
+ candidate_filesets,
+ automatic_merged_filesets,
+ manual_merged_filesets,
+ match_with_full_fileset,
+ mismatch_with_full_fileset,
+ manual_merged_with_detection,
+ filesets_with_missing_files,
+ conn,
+ skiplog,
+):
+ """
+ Performs matching for scan.dat.
+ If single candidate for match:
+ detection -> Copy all the files and checksums from scan.
+ partial -> Copy all the files and checksums from scan.
+ full -> Drop the scan fileset. But show the differences in file if any.
+ If more than one candidate for match:
+ Put them for manual merge.
+ """
+ with conn.cursor() as cursor:
+ if len(candidate_filesets) == 1:
+ matched_fileset_id = candidate_filesets[0]
+ cursor.execute(
+ "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,)
+ )
+ status = cursor.fetchone()["status"]
+ # Partial filesets can be turned full directly, as the files have already been updated.
+ # But the files that had missing size were not updated, so we need to check.
+ if status == "partial":
+                # Partial filesets contain all the files, as do scanned filesets, so this case should not normally happen.
+ if total_files(matched_fileset_id, conn) > total_fileset_files(fileset):
+ category_text = "Missing files"
+ log_text = f"Missing files in Fileset:{fileset_id}. Try manual merge with Fileset:{matched_fileset_id}."
+ add_manual_merge(
+ candidate_filesets,
+ fileset_id,
+ category_text,
+ log_text,
+ user,
+ conn,
+ log_text,
+ )
+ filesets_with_missing_files += 1
+
+ else:
+ update_all_files(fileset, matched_fileset_id, False, conn)
+ update_fileset_status(cursor, matched_fileset_id, "full")
+ if not skiplog:
+ log_matched_fileset(
+ src,
+ fileset_id,
+ matched_fileset_id,
+ "full",
+ user,
+ conn,
+ )
+ delete_original_fileset(fileset_id, conn)
+ automatic_merged_filesets += 1
+
+            # Detection filesets can be turned full if the numbers of files are equal,
+            # otherwise we do a manual merge to remove extra files.
+ elif status == "detection":
+ if total_fileset_files(fileset) == total_files(
+ matched_fileset_id, conn, detection_only=True
+ ):
+ update_all_files(fileset, matched_fileset_id, True, conn)
+ update_fileset_status(cursor, matched_fileset_id, "full")
+ if not skiplog:
+ log_matched_fileset(
+ src,
+ fileset_id,
+ matched_fileset_id,
+ "full",
+ user,
+ conn,
+ )
+ delete_original_fileset(fileset_id, conn)
+ automatic_merged_filesets += 1
+
+ else:
+ category_text = "Manual Merge - Detection found"
+ log_text = f"Matched with detection. Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}."
+ add_manual_merge(
+ candidate_filesets,
+ fileset_id,
+ category_text,
+ log_text,
+ user,
+ conn,
+ log_text,
+ )
+ manual_merged_with_detection += 1
+
+ # Drop the fileset, note down the file differences
+ elif status == "full":
+ (unmatched_candidate_files, unmatched_scan_files) = get_unmatched_files(
+ matched_fileset_id, fileset, conn
+ )
+ fully_matched = (
+ True
+ if len(unmatched_candidate_files) == 0
+ and len(unmatched_scan_files) == 0
+ else False
+ )
+ if fully_matched:
+ match_with_full_fileset += 1
+ else:
+ mismatch_with_full_fileset += 1
+ log_scan_match_with_full(
+ fileset_id,
+ matched_fileset_id,
+ unmatched_candidate_files,
+ unmatched_scan_files,
+ fully_matched,
+ user,
+ conn,
+ )
+ delete_original_fileset(fileset_id, conn)
+
+ elif len(candidate_filesets) > 1:
+ category_text = "Manual Merge - Multiple Candidates"
+ log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidate_filesets)}."
+ manual_merged_filesets += 1
+ add_manual_merge(
+ candidate_filesets,
+ fileset_id,
+ category_text,
+ log_text,
+ user,
+ conn,
+ log_text,
+ )
+
+ return (
+ automatic_merged_filesets,
+ manual_merged_filesets,
+ match_with_full_fileset,
+ mismatch_with_full_fileset,
+ manual_merged_with_detection,
+ filesets_with_missing_files,
+ )
+
+
+def update_all_files(fileset, candidate_fileset_id, is_candidate_detection, conn):
+ """
+ Updates all the files, if they were missed out earlier due to missing size.
+ """
+ with conn.cursor() as cursor:
+ # Extracting the filename from the filepath.
+ cursor.execute(
+ "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = %s",
+ (candidate_fileset_id,),
+ )
+ target_files = cursor.fetchall()
+ candidate_files = {
+ target_file["id"]: target_file["name"].lower()
+ for target_file in target_files
+ }
+
+ scan_checksums = set()
+ scan_names_by_checksum = defaultdict(str)
+ same_filename_count = defaultdict(int)
+
+ filename_to_filepath_map = defaultdict(str)
+ filepath_to_checksum_map = defaultdict(dict)
+ filepath_to_sizes_map = defaultdict(dict)
+
+ for file in fileset["rom"]:
+ base_name = os.path.basename(normalised_path(file["name"])).lower()
+ checksums = defaultdict(str)
+ sizes = defaultdict(int)
+ for key in file:
+ if key.startswith("md5"):
+ scan_checksums.add((file[key], base_name))
+ scan_names_by_checksum[(file[key], base_name)] = file["name"]
+ checksums[key] = file[key]
+ if key.startswith("size"):
+ sizes[key] = file[key]
+
+ filepath_to_sizes_map[file["name"]] = sizes
+ filepath_to_checksum_map[file["name"]] = checksums
+ same_filename_count[base_name] += 1
+ filename_to_filepath_map[base_name] = file["name"]
+
+ checksums = defaultdict(dict)
+ filepath = ""
+
+ for file_id, file_name in candidate_files.items():
+ file_name = file_name.lower()
+ # Match by filename
+ if same_filename_count[file_name] == 1:
+ filepath = filename_to_filepath_map[file_name]
+ checksums = filepath_to_checksum_map[filepath]
+
+ # If same filename occurs multiple times, fallback to checksum based match
+ else:
+ cursor.execute(
+ "SELECT checksum FROM filechecksum WHERE file = %s", (file_id,)
+ )
+ checksum_rows = cursor.fetchall()
+ for row in checksum_rows:
+ checksum = row["checksum"]
+ if (checksum, file_name) in scan_checksums:
+ filepath = scan_names_by_checksum[(checksum, file_name)]
+ checksums = filepath_to_checksum_map[filepath]
+
+ # Delete older checksums
+ query = """
+ DELETE FROM filechecksum
+ WHERE file = %s
+ """
+ cursor.execute(query, (file_id,))
+ # Update the checksums
+ for key, checksum in checksums.items():
+ checksize, checktype, checksum = get_checksum_props(key, checksum)
+ query = "INSERT INTO filechecksum (file, checksize, checktype, checksum) VALUES (%s, %s, %s, %s)"
+ cursor.execute(query, (file_id, checksize, checktype, checksum))
+
+ # Also updates the sizes, do not update the name if fileset not in detection state
+ query = """
+ UPDATE file
+ SET size = %s,
+ `size-r` = %s,
+ `size-rd` = %s
+ """
+ sizes = filepath_to_sizes_map[filepath]
+ if is_candidate_detection:
+ query += ",name = %s WHERE id = %s"
+ params = (
+ sizes["size"],
+ sizes["size-r"],
+ sizes["size-rd"],
+ normalised_path(filepath),
+ file_id,
+ )
+ else:
+ query += "WHERE id = %s"
+ params = (sizes["size"], sizes["size-r"], sizes["size-rd"], file_id)
+ cursor.execute(query, params)
+
+
+def total_files(fileset_id, conn, detection_only=False):
+ """
+ Returns the total number of files (only detection files if detection_only set to true) present in the given fileset from the database.
+ """
+ with conn.cursor() as cursor:
+ query = """
+ SELECT COUNT(*) AS count
+ FROM file f
+ JOIN fileset fs ON fs.id = f.fileset
+ """
+ if detection_only:
+ query += """
+ WHERE f.detection = 1
+ AND fs.id = %s
+ """
+ else:
+ query += "WHERE fs.id = %s"
+ cursor.execute(query, (fileset_id,))
+ return cursor.fetchone()["count"]
+
+
+def total_fileset_files(fileset):
+ """
+ Returns the number of files present in the fileset
+ """
+ return len(fileset["rom"])
+
+
+def scan_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn):
+ """
+ Returns a list of candidate filesets that can be merged.
+ Performs early filtering in SQL (by name, size) and then
+ applies checksum filtering and max-match filtering in Python.
+ """
+ with conn.cursor() as cursor:
+ # Fetching detection filename and all sizes (size, size-r, size-rd) from database
+ query = """
+ SELECT fs.id AS fileset_id, f.id as file_id, f.name, f.size,
+ f.`size-r` AS size_r, f.`size-rd` AS size_rd
+ FROM file f
+ JOIN fileset fs ON f.fileset = fs.id
+ JOIN game g ON g.id = fs.game
+ JOIN engine e ON e.id = g.engine
+ JOIN transactions t ON t.fileset = fs.id
+ WHERE f.detection = 1
+ AND t.transaction != %s
+ """
+ cursor.execute(query, (transaction_id,))
+ raw_candidates = cursor.fetchall()
+
+ # fileset id to detection files map
+ candidate_map = defaultdict(list)
+ total_detection_files_map = defaultdict(int)
+ for row in raw_candidates:
+ candidate_map[row["fileset_id"]].append(
+ {
+ "file_id": row["file_id"],
+ "name": os.path.basename(normalised_path(row["name"])).lower(),
+ "size": row["size"],
+ "size-r": row["size_r"],
+ "size-rd": row["size_rd"],
+ }
+ )
+ for id, files in candidate_map.items():
+ total_detection_files_map[id] = len(files)
+
+ set_checksums = set()
+ set_file_name_size = set()
+ for file in fileset["rom"]:
+ name = os.path.basename(normalised_path(file["name"]))
+ for key in file:
+ if key.startswith("md5"):
+ set_checksums.add(
+ (
+ file[key],
+ name.lower(),
+ int(file["size"]),
+ int(file["size-r"]),
+ int(file["size-rd"]),
+ )
+ )
+ set_checksums.add(
+ (
+ file[key],
+ name.lower(),
+ -1,
+ int(file["size-r"]),
+ int(file["size-rd"]),
+ )
+ )
+ set_file_name_size.add(
+ (name.lower(), -1, int(file["size-r"]), int(file["size-rd"]))
+ )
+ set_file_name_size.add(
+ (name.lower(), int(file["size"]), int(file["size-r"]), int(file["size-rd"]))
+ )
+
+ # Filter candidates by detection filename and file size (including -1) and increase matched file count
+ # if filesize = -1,
+ # elif filesize <= checksize and checksum matches,
+ # elif filesize > checksize.
+ match_counts = {}
+ for fileset_id, files in candidate_map.items():
+ count = 0
+ with conn.cursor() as cursor:
+ for f in files:
+ filename = os.path.basename(f["name"]).lower()
+ size = f["size"]
+ size_r = f["size-r"]
+ size_rd = f["size-rd"]
+ if (filename, size, size_r, size_rd) in set_file_name_size:
+ if size == -1:
+ count += 1
+ else:
+ cursor.execute(
+ """
+ SELECT checksum, checksize, checktype
+ FROM filechecksum
+ WHERE file = %s
+ """,
+ (f["file_id"],),
+ )
+ checksums = cursor.fetchall()
+ not_inc_count = False
+ for c in checksums:
+ filesize = size
+ checksum = c["checksum"]
+ checksize = c["checksize"]
+ checktype = c["checktype"]
+ # Macfiles handling
+ if checktype in ["md5-r", "md5-rt"]:
+ filesize = size_rd
+
+ if checksize == "1M":
+ checksize = 1048576
+ elif checksize == "0":
+ checksize = filesize
+ if filesize <= int(checksize):
+ if (
+ checksum,
+ filename,
+ size,
+ size_r,
+ size_rd,
+ ) in set_checksums:
+ count += 1
+ not_inc_count = True
+ # if it was a true match, checksum should be present
+ break
+ if not not_inc_count:
+ count += 1
+ if count > 0 and total_detection_files_map[fileset_id] <= count:
+ match_counts[fileset_id] = count
+
+ # Filter only entries with maximum number of matched files
+ if not match_counts:
+ return []
+
+ max_match = max(match_counts.values())
+ candidates = [fid for fid, count in match_counts.items() if count == max_match]
+
+ matched_candidates = []
+ for candidate in candidates:
+ if is_full_detection_checksum_match(candidate, fileset, conn):
+ matched_candidates.append(candidate)
+
+ if len(matched_candidates) != 0:
+ candidates = matched_candidates
+
+ return candidates
+
+
+def get_unmatched_files(candidate_fileset, fileset, conn):
+ """
+ Checks if all checksums from candidate_fileset match dat file checksums.
+ Returns:
+ unmatched_candidate_files: candidate files whose checksums weren't found in scan
+ unmatched_dat_files: dat files whose checksums weren't matched by candidate
+ """
+ with conn.cursor() as cursor:
+ cursor.execute(
+ "SELECT id, name FROM file WHERE fileset = %s", (candidate_fileset,)
+ )
+ candidate_file_rows = cursor.fetchall()
+ candidate_files = {row["id"]: row["name"] for row in candidate_file_rows}
+
+ dat_checksums = set()
+ dat_names_by_checksum = {}
+
+ for file in fileset["rom"]:
+ base_name = os.path.basename(normalised_path(file["name"])).lower()
+ for key in file:
+ if key.startswith("md5"):
+ dat_checksums.add((file[key], base_name))
+ dat_names_by_checksum[(file[key], base_name)] = file["name"]
+
+ unmatched_candidate_files = []
+ matched_dat_pairs = set()
+
+ for file_id, file_name in candidate_files.items():
+ cursor.execute(
+ "SELECT checksum FROM filechecksum WHERE file = %s", (file_id,)
+ )
+ checksum_rows = cursor.fetchall()
+
+ base_name = os.path.basename(file_name).lower()
+ match_found = False
+
+ for row in checksum_rows:
+ checksum = row["checksum"]
+ if (checksum, base_name) in dat_checksums:
+ matched_dat_pairs.add((checksum, base_name))
+ match_found = True
+
+ if not match_found:
+ unmatched_candidate_files.append(file_name)
+
+ unmatched_dat_files = {
+ dat_names_by_checksum[key]
+ for key in dat_checksums
+ if key not in matched_dat_pairs
+ }
+ unmatched_dat_files = list(unmatched_dat_files)
+
+ return (unmatched_candidate_files, unmatched_dat_files)
+
+
+def is_full_detection_checksum_match(candidate_fileset, fileset, conn):
+ """
+    Return type - Boolean
+    Checks whether every detection file in the candidate fileset has a corresponding checksum that matches the scan.
+
+ scan - rom ( name "AFM Read Me!_2" size 8576 size-r 1 size-rd 0 modification-time 1993-05-12 md5 dsd16ccea050db521a678a1cdc33794c md5-5000 008e76ec3ae58d0add637ea7aa299a2a md5-t-5000 118e76ec3ae58d0add637ea7aa299a2c md5-1048576 37d16ccea050db521a678a1cdc33794c)
+ """
+ with conn.cursor() as cursor:
+ cursor.execute(
+ "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name FROM file WHERE detection=1 AND fileset = %s",
+ (candidate_fileset,),
+ )
+ target_files = cursor.fetchall()
+ candidate_files = {
+ target_file["id"]: target_file["name"] for target_file in target_files
+ }
+
+ # set of (checksum, filename)
+ scan_checksums = set()
+ for file in fileset["rom"]:
+ for key in file:
+ if key.startswith("md5"):
+ name = os.path.basename(normalised_path(file["name"]))
+ scan_checksums.add((file[key], name.lower()))
+
+ for detection_file_id, detection_file_name in candidate_files.items():
+ query = """
+ SELECT fc.checksum, fc.checksize, fc.checktype
+ FROM filechecksum fc
+ WHERE fc.file = %s
+ """
+ cursor.execute(query, (detection_file_id,))
+ checksums_info = cursor.fetchall()
+            mismatch_found = False
+ if checksums_info:
+ for checksum_info in checksums_info:
+ checksum = checksum_info["checksum"]
+ if (
+ checksum,
+ os.path.basename(detection_file_name.lower()),
+ ) not in scan_checksums:
+                        mismatch_found = True
+ break
+
+            if mismatch_found:
+ return False
+
+ return True
+
+
+# -------------------------------------------------------------------------------------------------------
+# Set.dat processing below
+# -------------------------------------------------------------------------------------------------------
+
+
def set_process(
game_data,
resources,
@@ -909,6 +1698,9 @@ def set_process(
mismatch_filesets = 0
dropped_early_no_candidate = 0
dropped_early_single_candidate_multiple_sets = 0
+
+ fileset_count = 0
+
# A mapping from set filesets to candidate filesets list
set_to_candidate_dict = defaultdict(list)
id_to_fileset_dict = defaultdict(dict)
@@ -937,6 +1729,11 @@ def set_process(
megakey = ""
log_text = f"State {source_status}."
+ set_dat_metadata = ""
+ for meta in fileset:
+ if meta != "rom":
+ set_dat_metadata += meta + " = " + fileset[meta] + " , "
+
(fileset_id, existing) = insert_new_fileset(
fileset,
conn,
@@ -947,13 +1744,18 @@ def set_process(
transaction_id,
log_text,
user,
+ set_dat_metadata=set_dat_metadata,
skiplog=skiplog,
)
+
if existing:
continue
- candidate_filesets = set_filter_candidate_filesets(
- fileset_id, fileset, transaction_id, conn
+            # Extract the engine name from the sourcefile; glk engines use separate matching logic.
+ engine_name = fileset["sourcefile"].split("-")[0]
+
+ (candidate_filesets, fileset_count) = set_filter_candidate_filesets(
+ fileset_id, fileset, fileset_count, transaction_id, engine_name, conn
)
# Mac files in set.dat are not represented properly and they won't find a candidate fileset for a match, so we can drop them.
@@ -969,11 +1771,19 @@ def set_process(
)
dropped_early_no_candidate += 1
delete_original_fileset(fileset_id, conn)
-
id_to_fileset_dict[fileset_id] = fileset
set_to_candidate_dict[fileset_id].extend(candidate_filesets)
- # Remove all such filesets, which have many to one mapping with a single candidate, those are extra variants.
+ console_message = "Candidate filtering finished."
+ console_log(console_message)
+ console_message = (
+ f"{dropped_early_no_candidate} Filesets Dropped for No candidates."
+ )
+ console_log(console_message)
+ console_message = "Looking for duplicates..."
+ console_log(console_message)
+
+    # For set.dat filesets that share a single candidate (many-to-one mapping), keep one to merge and drop the rest as duplicates.
value_to_keys = defaultdict(list)
for set_fileset, candidates in set_to_candidate_dict.items():
if len(candidates) == 1:
@@ -997,7 +1807,12 @@ def set_process(
platform = result["platform"]
language = result["language"]
+            # Keep the first entry so it can be merged; drop the remaining duplicates.
+ skip = True
for set_fileset in set_filesets:
+ if skip:
+ skip = False
+ continue
fileset = id_to_fileset_dict[set_fileset]
category_text = "Drop fileset - Duplicates"
fileset_name = fileset["name"] if "name" in fileset else ""
@@ -1005,6 +1820,7 @@ def set_process(
fileset["description"] if "description" in fileset else ""
)
log_text = f"Drop fileset, multiple filesets mapping to single detection. Name: {fileset_name}, Description: {fileset_description}. Clashed with Fileset:{candidate} ({engine}:{gameid}-{platform}-{language})"
+ console_log(log_text)
create_log(
escape_string(category_text), user, escape_string(log_text), conn
)
@@ -1013,8 +1829,18 @@ def set_process(
del set_to_candidate_dict[set_fileset]
del id_to_fileset_dict[set_fileset]
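+    # Filesets that end up with multiple candidates are collected here and resolved after the matching pass.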
+ manual_merge_map = defaultdict(list)
+
+ match_count = 1
for fileset_id, candidate_filesets in set_to_candidate_dict.items():
+ console_log_matching(match_count)
fileset = id_to_fileset_dict[fileset_id]
+
+ # Filter by platform to reduce manual merge
+ # candidate_filesets = set_filter_by_platform(
+ # fileset["name"], candidate_filesets, conn
+ # )
+
(
fully_matched_filesets,
auto_merged_filesets,
@@ -1031,12 +1857,44 @@ def set_process(
auto_merged_filesets,
manual_merged_filesets,
mismatch_filesets,
+ manual_merge_map,
+ set_to_candidate_dict,
conn,
skiplog,
)
- # Final log
+ match_count += 1
+ console_log("Matching performed.")
+
with conn.cursor() as cursor:
+ for fileset_id, candidates in manual_merge_map.items():
+ if len(candidates) == 0:
+ category_text = "Drop fileset - No Candidates"
+ fileset = id_to_fileset_dict[fileset_id]
+ fileset_name = fileset["name"] if "name" in fileset else ""
+ fileset_description = (
+ fileset["description"] if "description" in fileset else ""
+ )
+ log_text = f"Drop fileset as no matching candidates. Name: {fileset_name}, Description: {fileset_description}."
+ create_log(
+ escape_string(category_text), user, escape_string(log_text), conn
+ )
+ dropped_early_no_candidate += 1
+ delete_original_fileset(fileset_id, conn)
+ else:
+ category_text = "Manual Merge Required"
+ log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidates)}."
+ manual_merged_filesets += 1
+ add_manual_merge(
+ candidates,
+ fileset_id,
+ category_text,
+ log_text,
+ user,
+ conn,
+ log_text,
+ )
+
cursor.execute(
"SELECT COUNT(fileset) from transactions WHERE `transaction` = %s",
(transaction_id,),
@@ -1047,9 +1905,48 @@ def set_process(
create_log(escape_string(category_text), user, escape_string(log_text), conn)
category_text = "Upload information"
log_text = f"Number of filesets: {fileset_insertion_count}. Filesets automatically merged: {auto_merged_filesets}. Filesets dropped early (no candidate) - {dropped_early_no_candidate}. Filesets dropped early (mapping to single detection) - {dropped_early_single_candidate_multiple_sets}. Filesets requiring manual merge: {manual_merged_filesets}. Partial/Full filesets already present: {fully_matched_filesets}. Partial/Full filesets with mismatch {mismatch_filesets}."
+ console_log(log_text)
create_log(escape_string(category_text), user, escape_string(log_text), conn)
+def set_filter_by_platform(gameid, candidate_filesets, conn):
+ """
+ Return - list(number) : list of fileset ids of filtered candidates.
+    The number of manual merges is too high when the file size is not present (equal to -1), so we try to filter by the platform extracted from the gameid of the set.dat fileset. We may disable this feature later or keep it optional behind a command-line argument.
+ """
+ with conn.cursor() as cursor:
+ # e.g. sq2-coco3-1
+ possible_platform_names = gameid.split("-")[1:]
+
+ # Align platform names in set.dat and detection entries
+ for i, platform in enumerate(possible_platform_names):
+ if platform == "win":
+ possible_platform_names[i] = "windows"
+ elif platform == "mac":
+ possible_platform_names[i] = "macintosh"
+
+ filtered_candidate_fileset = []
+
+ for candidate_fileset_id in candidate_filesets:
+ query = """
+ SELECT g.platform
+ FROM fileset fs
+ JOIN game g ON g.id = fs.game
+ WHERE fs.id = %s
+ """
+ cursor.execute(query, (candidate_fileset_id,))
+ candidate_platform = cursor.fetchone()["platform"]
+ if candidate_platform in possible_platform_names:
+ filtered_candidate_fileset.append(candidate_fileset_id)
+
+    # If nothing was filtered, the platform information was likely missing, so we fall back to the original list of candidates.
+ return (
+ candidate_filesets
+ if len(filtered_candidate_fileset) == 0
+ else filtered_candidate_fileset
+ )
+
+
def set_perform_match(
fileset,
src,
@@ -1061,16 +1958,17 @@ def set_perform_match(
auto_merged_filesets,
manual_merged_filesets,
mismatch_filesets,
+ manual_merge_map,
+ set_to_candidate_dict,
conn,
skiplog,
):
"""
- TODO
+ "Performs matching for set.dat"
"""
with conn.cursor() as cursor:
if len(candidate_filesets) == 1:
matched_fileset_id = candidate_filesets[0]
-
cursor.execute(
"SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,)
)
@@ -1080,7 +1978,7 @@ def set_perform_match(
set_populate_file(fileset, matched_fileset_id, conn, detection)
auto_merged_filesets += 1
if not skiplog:
- set_log_matched_fileset(
+ log_matched_fileset(
src,
fileset_id,
matched_fileset_id,
@@ -1089,10 +1987,19 @@ def set_perform_match(
conn,
)
delete_original_fileset(fileset_id, conn)
+ remove_manual_merge_if_size_mismatch(
+ matched_fileset_id, manual_merge_map, set_to_candidate_dict, conn
+ )
elif status == "partial" or status == "full":
- (is_match, unmatched_files) = is_full_checksum_match(
+ (unmatched_candidate_files, unmatched_dat_files) = get_unmatched_files(
matched_fileset_id, fileset, conn
)
+            is_match = (
+                len(unmatched_candidate_files) == 0
+                and len(unmatched_dat_files) == 0
+            )
if is_match:
category_text = "Already present"
log_text = f"Already present as - Fileset:{matched_fileset_id}. Deleting Fileset:{fileset_id}"
@@ -1108,53 +2015,21 @@ def set_perform_match(
else:
category_text = "Mismatch"
- log_text = f"Fileset:{fileset_id} mismatched with Fileset:{matched_fileset_id} with status:{status}. Try manual merge."
- print_text = f"Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}. Unmatched files: {len(unmatched_files)}."
+ log_text = f"Fileset:{fileset_id} mismatched with Fileset:{matched_fileset_id} with status:{status}. Try manual merge. Unmatched Files in set.dat fileset = {len(unmatched_dat_files)} Unmatched Files in candidate fileset = {len(unmatched_candidate_files)}. List of unmatched files scan.dat : {', '.join(scan_file for scan_file in unmatched_dat_files)}, List of unmatched files full fileset : {', '.join(scan_file for scan_file in unmatched_candidate_files)}"
+ console_log(log_text)
+ # print_text = f"Merge Fileset:{fileset_id} manually with Fileset:{matched_fileset_id}. Unmatched files: {len(unmatched_files)}."
mismatch_filesets += 1
add_manual_merge(
[matched_fileset_id],
fileset_id,
category_text,
log_text,
- print_text,
user,
conn,
)
elif len(candidate_filesets) > 1:
- found_match = False
- for candidate_fileset in candidate_filesets:
- (is_match, _) = is_full_checksum_match(candidate_fileset, fileset, conn)
- if is_match:
- update_fileset_status(cursor, candidate_fileset, "partial")
- set_populate_file(fileset, candidate_fileset, conn, detection)
- auto_merged_filesets += 1
- if not skiplog:
- set_log_matched_fileset(
- src,
- fileset_id,
- candidate_fileset,
- "partial",
- user,
- conn,
- )
- delete_original_fileset(fileset_id, conn)
- found_match = True
- break
-
- if not found_match:
- category_text = "Manual Merge Required"
- log_text = f"Merge Fileset:{fileset_id} manually. Possible matches are: {', '.join(f'Fileset:{id}' for id in candidate_filesets)}."
- manual_merged_filesets += 1
- add_manual_merge(
- candidate_filesets,
- fileset_id,
- category_text,
- log_text,
- log_text,
- user,
- conn,
- )
+ manual_merge_map[fileset_id] = candidate_filesets
return (
fully_matched_filesets,
@@ -1164,8 +2039,74 @@ def set_perform_match(
)
+def remove_manual_merge_if_size_mismatch(
+ child_fileset, manual_merge_map, set_to_candidate_dict, conn
+):
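+    """
+    After child_fileset has been merged, re-check every other set.dat fileset that still
+    lists it as a merge candidate. If a parent fileset has no file with the same name and
+    size as one of the candidate's detection files, the pairing can no longer be a valid
+    match, so it is removed from the pending merge maps and the possible_merges table.
+    """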
+ with conn.cursor() as cursor:
+ query = """
+ SELECT f.name, f.size
+ FROM fileset fs
+ JOIN file f ON f.fileset = fs.id
+ WHERE fs.id = %s
+ AND f.detection = 1
+ """
+ cursor.execute(query, (child_fileset,))
+ files = cursor.fetchall()
+
+ for possible_removals in [manual_merge_map, set_to_candidate_dict]:
+ for parent_fileset, child_list in possible_removals.items():
+ if child_fileset not in child_list:
+ continue
+
+ for file in files:
+ if file["size"] == -1:
+ continue
+
+ query = """
+ SELECT fs.id
+ FROM fileset fs
+ JOIN file f ON f.fileset = fs.id
+ WHERE fs.id = %s
+ AND REGEXP_REPLACE(f.name, '^.*[\\\\/]', '') = %s
+ AND f.size = %s
+ LIMIT 1
+ """
+ filename = os.path.basename(normalised_path(file["name"]))
+ cursor.execute(query, (parent_fileset, filename, file["size"]))
+ result = cursor.fetchall()
+
+ if not result:
+ remove_manual_merge(
+ child_fileset,
+ parent_fileset,
+ manual_merge_map,
+ set_to_candidate_dict,
+ conn,
+ )
+ break
+
+
+def remove_manual_merge(
+ child_fileset, parent_fileset, manual_merge_map, set_to_candidate_dict, conn
+):
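+    """
+    Remove child_fileset from parent_fileset's candidate lists in the in-memory maps
+    and delete the corresponding entry from the possible_merges table.
+    """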
+ if parent_fileset in manual_merge_map:
+ if child_fileset in manual_merge_map[parent_fileset]:
+ manual_merge_map[parent_fileset].remove(child_fileset)
+ if parent_fileset in set_to_candidate_dict:
+ if child_fileset in set_to_candidate_dict[parent_fileset]:
+ set_to_candidate_dict[parent_fileset].remove(child_fileset)
+
+ with conn.cursor() as cursor:
+ query = """
+ DELETE FROM possible_merges
+ WHERE child_fileset = %s
+ AND parent_fileset = %s
+ """
+ cursor.execute(query, (child_fileset, parent_fileset))
+
+
def add_manual_merge(
- child_filesets, parent_fileset, category_text, log_text, print_text, user, conn
+ child_filesets, parent_fileset, category_text, log_text, user, conn, print_text=None
):
"""
Adds the manual merge entries to a table called possible_merges.
@@ -1181,7 +2122,8 @@ def add_manual_merge(
cursor.execute(query, (child_fileset, parent_fileset))
create_log(escape_string(category_text), user, escape_string(log_text), conn)
- print(print_text)
+ if print_text:
+ print(print_text)
def is_full_checksum_match(candidate_fileset, fileset, conn):
@@ -1219,71 +2161,190 @@ def is_full_checksum_match(candidate_fileset, fileset, conn):
return (len(unmatched_files) == 0, unmatched_files)
-def set_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn):
+def set_filter_candidate_filesets(
+ fileset_id, fileset, fileset_count, transaction_id, engine_name, conn
+):
"""
- Returns a list of candidate filesets that can be merged
+ Returns a list of candidate filesets that can be merged.
+    Performs early filtering in SQL (by engine, and by gameid for glk engines),
+    then applies filename/size, checksum and max-match filtering in Python.
"""
+ is_glk = engine_name == "glk"
with conn.cursor() as cursor:
- # Returns those filesets which have all detection files matching in the set fileset filtered by engine, file name and file size(if not -1) sorted in descending order of matches
+ fileset_count += 1
+ console_log_candidate_filtering(fileset_count)
+        # Early SQL filter: fetch detection files of candidate filesets for this engine (and matching gameid for glk).
query = """
- WITH candidate_fileset AS (
- SELECT fs.id AS fileset_id, f.name, f.size
+ SELECT fs.id AS fileset_id, f.id AS file_id, f.name, f.size
FROM file f
JOIN fileset fs ON f.fileset = fs.id
JOIN game g ON g.id = fs.game
JOIN engine e ON e.id = g.engine
JOIN transactions t ON t.fileset = fs.id
- WHERE fs.id != %s
- AND e.engineid = %s
+ WHERE e.engineid = %s
AND f.detection = 1
AND t.transaction != %s
- ),
- total_detection_files AS (
- SELECT cf.fileset_id, COUNT(*) AS detection_files_found
- FROM candidate_fileset cf
- GROUP BY fileset_id
- ),
- set_fileset AS (
- SELECT name, size FROM file
- WHERE fileset = %s
- ),
- matched_detection_files AS (
- SELECT cf.fileset_id, COUNT(*) AS match_files_count
- FROM candidate_fileset cf
- JOIN set_fileset sf ON ( (
- cf.name = sf.name
- OR
- REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '')
- ) AND (cf.size = sf.size OR cf.size = -1) )
- GROUP BY cf.fileset_id
- ),
- valid_matched_detection_files AS (
- SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count
- FROM matched_detection_files mdf
- JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id
- WHERE tdf.detection_files_found <= mdf.match_files_count
- ),
- max_match_count AS (
- SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files
- )
- SELECT vmdf.fileset_id
- FROM valid_matched_detection_files vmdf
- JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id
- JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count
"""
-
+ if is_glk:
+ query += " AND (g.gameid = %s OR (g.gameid != %s AND g.gameid LIKE %s))"
+ gameid_pattern = f"%{fileset['name']}%"
+ cursor.execute(
+ query,
+ (
+ engine_name,
+ transaction_id,
+ fileset["name"],
+ fileset["name"],
+ gameid_pattern,
+ ),
+ )
+ else:
+ cursor.execute(query, (fileset["sourcefile"], transaction_id))
+ raw_candidates = cursor.fetchall()
+
+ # fileset id to detection files map
+ candidate_map = defaultdict(list)
+ total_detection_files_map = defaultdict(int)
+ for row in raw_candidates:
+ candidate_map[row["fileset_id"]].append(
+ {
+ "file_id": row["file_id"],
+ "name": os.path.basename(normalised_path(row["name"])).lower(),
+ "size": row["size"],
+ }
+ )
+ for id, files in candidate_map.items():
+ total_detection_files_map[id] = len(files)
+
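+        # Build lookup sets from the set.dat fileset: (checksum, name, size) and (name, size)
+        # tuples, with size -1 variants as wildcards. For glk engines the plain file sizes are
+        # collected too, since their files are matched by size.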
+ set_checksums = set()
+ set_file_name_size = set()
+ set_glk_file_size = set()
+ for file in fileset["rom"]:
+ name = os.path.basename(normalised_path(file["name"]))
+ for key in file:
+ if key.startswith("md5"):
+ set_checksums.add((file[key], name.lower(), int(file["size"])))
+ set_checksums.add((file[key], name.lower(), -1))
+ set_file_name_size.add((name.lower(), -1))
+ set_file_name_size.add((name.lower(), int(file["size"])))
+ if is_glk:
+ set_glk_file_size.add(int(file["size"]))
+
+        # Filter candidates by detection filename and file size (a size of -1 acts as a wildcard).
+        # A detection file is counted as matched when:
+        #   - its size is -1, or
+        #   - its size <= checksize and one of its checksums matches, or
+        #   - its size > checksize (the stored checksum only covers a prefix, so it cannot be compared here).
+ match_counts = {}
+ for fileset_id, files in candidate_map.items():
+ count = 0
+ with conn.cursor() as cursor:
+ for f in files:
+ filename = os.path.basename(f["name"]).lower()
+ filesize = f["size"]
+ if is_glk and (filesize in set_glk_file_size or filesize == 0):
+ count += 1
+ if (filename, filesize) in set_file_name_size:
+ if filesize == -1:
+ count += 1
+ else:
+ cursor.execute(
+ """
+ SELECT checksum, checksize, checktype
+ FROM filechecksum
+ WHERE file = %s
+ """,
+ (f["file_id"],),
+ )
+ checksums = cursor.fetchall()
+ not_inc_count = False
+ for c in checksums:
+ checksum = c["checksum"]
+ checksize = c["checksize"]
+ if checksize == "1M":
+ checksize = 1048576
+ elif checksize == "0":
+ checksize = filesize
+ if filesize <= int(checksize):
+ if (checksum, filename, filesize) in set_checksums:
+ count += 1
+ not_inc_count = True
+ # if it was a true match, checksum should be present
+ break
+ if not not_inc_count:
+ count += 1
+ if count > 0 and total_detection_files_map[fileset_id] <= count:
+ match_counts[fileset_id] = count
+
+ # Filter only entries with maximum number of matched files
+ if not match_counts:
+ return ([], fileset_count)
+
+ max_match = max(match_counts.values())
+ candidates = [fid for fid, count in match_counts.items() if count == max_match]
+
+ matched_candidates = []
+ for candidate in candidates:
+ if is_full_detection_checksum_match(candidate, fileset, conn):
+ matched_candidates.append(candidate)
+
+ if len(matched_candidates) != 0:
+ candidates = matched_candidates
+
+ return (candidates, fileset_count)
+
+
+def is_candidate_by_checksize(candidate, fileset, conn):
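+    """
+    Returns True if any checksum of any detection file in the candidate either matches a
+    checksum from the dat fileset or cannot be compared (file size is -1 or larger than
+    the stored checksize); returns False only when every comparable checksum fails.
+    """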
+ with conn.cursor() as cursor:
cursor.execute(
- query, (fileset_id, fileset["sourcefile"], transaction_id, fileset_id)
+ "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE detection=1 AND fileset = %s",
+ (candidate,),
)
- rows = cursor.fetchall()
-
- candidates = []
- if rows:
- for row in rows:
- candidates.append(row["fileset_id"])
+ target_files = cursor.fetchall()
+ candidate_files = {
+ target_file["id"]: [target_file["name"], target_file["size"]]
+ for target_file in target_files
+ }
- return candidates
+ # set of (checksum, filename)
+ scan_checksums = set()
+ for file in fileset["rom"]:
+ for key in file:
+ if key.startswith("md5"):
+ name = os.path.basename(normalised_path(file["name"]))
+ scan_checksums.add((file[key], name.lower()))
+
+ for detection_file_id, [
+ detection_file_name,
+ detection_file_size,
+ ] in candidate_files.items():
+ query = """
+ SELECT fc.checksum, fc.checksize, fc.checktype
+ FROM filechecksum fc
+ WHERE fc.file = %s
+ """
+ cursor.execute(query, (detection_file_id,))
+ checksums_info = cursor.fetchall()
+ if checksums_info:
+ for checksum_info in checksums_info:
+ checksum = checksum_info["checksum"]
+ checksize = checksum_info["checksize"]
+ if checksize == "1M":
+ checksize = 1048576
+ if (
+ (
+ checksum,
+ os.path.basename(detection_file_name.lower()),
+ )
+ not in scan_checksums
+ and detection_file_size <= int(checksize)
+ and detection_file_size != -1
+ ):
+ continue
+ else:
+ return True
+ return False
def process_fileset(
@@ -1374,13 +2435,13 @@ def find_matching_filesets(fileset, conn, status):
checksize, checktype, checksum = get_checksum_props(
checktype, checksum
)
- query = f"""SELECT DISTINCT fs.id AS fileset_id
+ query = """SELECT DISTINCT fs.id AS fileset_id
FROM fileset fs
JOIN file f ON fs.id = f.fileset
JOIN filechecksum fc ON f.id = fc.file
- WHERE fc.checksum = '{checksum}' AND fc.checktype = '{checktype}'
- AND fs.status IN ({state})"""
- cursor.execute(query)
+ WHERE fc.checksum = %s AND fc.checktype = %s
+ AND fs.status IN (%s)"""
+ cursor.execute(query, (checksum, checktype, state))
records = cursor.fetchall()
if records:
for record in records:
@@ -1403,16 +2464,16 @@ def matching_set(fileset, conn):
checksum = checksum.split(":")[1]
size = file["size"]
- query = f"""
+ query = """
SELECT DISTINCT fs.id AS fileset_id
FROM fileset fs
JOIN file f ON fs.id = f.fileset
JOIN filechecksum fc ON f.id = fc.file
- WHERE fc.checksum = '{checksum}' AND fc.checktype LIKE 'md5%'
- AND fc.checksize > {size}
+            WHERE fc.checksum = %s AND fc.checktype LIKE 'md5%%'
+ AND fc.checksize > %s
AND fs.status = 'detection'
"""
- cursor.execute(query)
+ cursor.execute(query, (checksum, size))
records = cursor.fetchall()
if records:
for record in records:
@@ -1442,11 +2503,12 @@ def handle_matched_filesets(
if is_full_matched:
break
cursor.execute(
- f"SELECT status FROM fileset WHERE id = {matched_fileset_id}"
+ "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,)
)
status = cursor.fetchone()["status"]
cursor.execute(
- f"SELECT COUNT(file.id) FROM file WHERE fileset = {matched_fileset_id}"
+ "SELECT COUNT(file.id) FROM file WHERE fileset = %s",
+ (matched_fileset_id,),
)
count = cursor.fetchone()["COUNT(file.id)"]
@@ -1492,28 +2554,31 @@ def handle_matched_filesets(
def delete_original_fileset(fileset_id, conn):
with conn.cursor() as cursor:
- cursor.execute(f"DELETE FROM file WHERE fileset = {fileset_id}")
- cursor.execute(f"DELETE FROM fileset WHERE id = {fileset_id}")
+ cursor.execute("DELETE FROM file WHERE fileset = %s", (fileset_id,))
+ cursor.execute("DELETE FROM fileset WHERE id = %s", (fileset_id,))
conn.commit()
def update_fileset_status(cursor, fileset_id, status):
- cursor.execute(f"""
+ cursor.execute(
+ """
UPDATE fileset SET
- status = '{status}',
- `timestamp` = FROM_UNIXTIME({int(time.time())})
- WHERE id = {fileset_id}
- """)
+ status = %s,
+ `timestamp` = FROM_UNIXTIME(%s)
+ WHERE id = %s
+ """,
+ (status, int(time.time()), fileset_id),
+ )
def populate_file(fileset, fileset_id, conn, detection):
with conn.cursor() as cursor:
- cursor.execute(f"SELECT * FROM file WHERE fileset = {fileset_id}")
+ cursor.execute("SELECT * FROM file WHERE fileset = %s", (fileset_id,))
target_files = cursor.fetchall()
target_files_dict = {}
for target_file in target_files:
cursor.execute(
- f"SELECT * FROM filechecksum WHERE file = {target_file['id']}"
+ "SELECT * FROM filechecksum WHERE file = %s", (target_file["id"],)
)
target_checksums = cursor.fetchall()
for checksum in target_checksums:
@@ -1579,7 +2644,7 @@ def populate_file(fileset, fileset_id, conn, detection):
for key, value in file.items():
if key not in ["name", "size", "size-r", "size-rd", "sha1", "crc"]:
- insert_filechecksum(file, key, conn)
+ insert_filechecksum(file, key, file_id, conn)
if value in target_files_dict and not file_exists:
cursor.execute(
f"SELECT detection_type FROM file WHERE id = {target_files_dict[value]['id']}"
@@ -1638,7 +2703,8 @@ def set_populate_file(fileset, fileset_id, conn, detection):
with conn.cursor() as cursor:
# Extracting the filename from the filepath.
cursor.execute(
- f"SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = {fileset_id}"
+ "SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = %s",
+ (fileset_id,),
)
target_files = cursor.fetchall()
candidate_files = {
@@ -1646,6 +2712,13 @@ def set_populate_file(fileset, fileset_id, conn, detection):
for target_file in target_files
}
+    # For glk engines, detection entries are matched by file size, so also index candidate files by size.
+ candidate_file_size = {
+ target_file["size"]: target_file["id"] for target_file in target_files
+ }
+
+ engine_name = fileset["sourcefile"].split("-")[0]
+
seen_detection_files = set()
for file in fileset["rom"]:
@@ -1655,35 +2728,38 @@ def set_populate_file(fileset, fileset_id, conn, detection):
filename = os.path.basename(normalised_path(file["name"]))
- if ((filename.lower(), file["size"]) in seen_detection_files) or (
- filename.lower() not in candidate_files
- or (
- filename.lower() in candidate_files
- and (
- candidate_files[filename.lower()][1] != -1
- and candidate_files[filename.lower()][1] != file["size"]
+ if (engine_name == "glk" and file["size"] not in candidate_file_size) or (
+ engine_name != "glk"
+ and (
+ (filename.lower(), file["size"]) in seen_detection_files
+ or (
+ filename.lower() not in candidate_files
+ or (
+ filename.lower() in candidate_files
+ and (
+ candidate_files[filename.lower()][1] != -1
+ and candidate_files[filename.lower()][1] != file["size"]
+ )
+ )
)
)
):
name = normalised_path(file["name"])
values = [name]
-
values.append(file["size"] if "size" in file else "0")
values.append(file["size-r"] if "size-r" in file else "0")
values.append(file["size-rd"] if "size-rd" in file else "0")
-
values.extend([checksum, fileset_id, detection, "None"])
- placeholders = (
- ["%s"] * (len(values[:5])) + ["%s"] + ["%s"] * 2 + ["NOW()"]
- )
- query = f"INSERT INTO file ( name, size, `size-r`, `size-rd`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ({', '.join(placeholders)})"
+ query = "INSERT INTO file ( name, size, `size-r`, `size-rd`, checksum, fileset, detection, detection_type, `timestamp` ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, NOW())"
cursor.execute(query, values)
cursor.execute("SET @file_last = LAST_INSERT_ID()")
cursor.execute("SELECT @file_last AS file_id")
- insert_filechecksum(file, "md5", conn)
+ file_id = cursor.fetchone()["file_id"]
+
+ insert_filechecksum(file, "md5", file_id, conn)
else:
query = """
@@ -1692,15 +2768,19 @@ def set_populate_file(fileset, fileset_id, conn, detection):
name = %s
WHERE id = %s
"""
+
# Filtering was by filename, but we are still updating the file with the original filepath.
cursor.execute(
query,
(
file["size"],
normalised_path(file["name"]),
- candidate_files[filename.lower()][0],
+ candidate_files[filename.lower()][0]
+ if engine_name != "glk"
+ else candidate_file_size[file["size"]],
),
)
+
query = """
INSERT INTO filechecksum (file, checksize, checktype, checksum)
VALUES (%s, %s, %s, %s)
@@ -1708,12 +2788,24 @@ def set_populate_file(fileset, fileset_id, conn, detection):
cursor.execute(
query,
(
- candidate_files[filename.lower()][0],
+ candidate_files[filename.lower()][0]
+ if engine_name != "glk"
+ else candidate_file_size[file["size"]],
checksize,
checktype,
checksum,
),
)
+
+ add_all_equal_checksums(
+ checksize,
+ checktype,
+ checksum,
+ candidate_files[filename.lower()][0]
+ if engine_name != "glk"
+ else candidate_file_size[file["size"]],
+ conn,
+ )
seen_detection_files.add((filename.lower(), file["size"]))
@@ -1727,6 +2819,7 @@ def insert_new_fileset(
transaction_id,
log_text,
user,
+ set_dat_metadata="",
ip="",
skiplog=False,
):
@@ -1739,28 +2832,32 @@ def insert_new_fileset(
log_text,
conn,
username=user,
+ set_dat_metadata=set_dat_metadata,
ip=ip,
skiplog=skiplog,
)
if fileset_id:
for file in fileset["rom"]:
insert_file(file, detection, src, conn)
+ file_id = None
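+            # insert_file() is expected to leave the id of the inserted row in the @file_last session variable.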
+ with conn.cursor() as cursor:
+ cursor.execute("SELECT @file_last AS file_id")
+ file_id = cursor.fetchone()["file_id"]
for key, value in file.items():
- if key not in ["name", "size", "size-r", "size-rd", "sha1", "crc"]:
- insert_filechecksum(file, key, conn)
+ if key not in [
+ "name",
+ "size",
+ "size-r",
+ "size-rd",
+ "sha1",
+ "crc",
+ "modification-time",
+ ]:
+ insert_filechecksum(file, key, file_id, conn)
return (fileset_id, existing)
def log_matched_fileset(src, fileset_last, fileset_id, state, user, conn):
- category_text = f"Matched from {src}"
- log_text = f"Matched Fileset:{fileset_id}. State {state}."
- log_last = create_log(
- escape_string(category_text), user, escape_string(log_text), conn
- )
- update_history(fileset_last, fileset_id, conn, log_last)
-
-
-def set_log_matched_fileset(src, fileset_last, fileset_id, state, user, conn):
category_text = f"Matched from {src}"
log_text = (
f"Matched Fileset:{fileset_last} with Fileset:{fileset_id}. State {state}."
@@ -1771,12 +2868,34 @@ def set_log_matched_fileset(src, fileset_last, fileset_id, state, user, conn):
update_history(fileset_last, fileset_id, conn, log_last)
+def log_scan_match_with_full(
+ fileset_last,
+ candidate_id,
+ unmatched_candidate_files,
+ unmatched_scan_files,
+ fully_matched,
+ user,
+ conn,
+):
+ category_text = "Mismatch with Full set"
+ if fully_matched:
+ category_text = "Existing as Full set."
+ log_text = f"""Files mismatched with Full Fileset:{candidate_id}. Unmatched Files in scan fileset = {len(unmatched_scan_files)}. Unmatched Files in full fileset = {len(unmatched_candidate_files)}. List of unmatched files scan.dat : {", ".join(scan_file for scan_file in unmatched_scan_files)}, List of unmatched files full fileset : {", ".join(scan_file for scan_file in unmatched_candidate_files)}"""
+ if fully_matched:
+ log_text = (
+ f"Fileset matched completely with Full Fileset:{candidate_id}. Dropping."
+ )
+ print(log_text)
+ create_log(escape_string(category_text), user, escape_string(log_text), conn)
+
+
def finalize_fileset_insertion(
conn, transaction_id, src, filepath, author, version, source_status, user
):
with conn.cursor() as cursor:
cursor.execute(
- f"SELECT COUNT(fileset) from transactions WHERE `transaction` = {transaction_id}"
+ "SELECT COUNT(fileset) from transactions WHERE `transaction` = %s",
+ (transaction_id,),
)
fileset_insertion_count = cursor.fetchone()["COUNT(fileset)"]
category_text = f"Uploaded from {src}"
@@ -1814,7 +2933,7 @@ def user_integrity_check(data, ip, game_metadata=None):
print(f"Failed to connect to database: {e}")
return
- conn.cursor().execute(f"SET @fileset_time_last = {int(time.time())}")
+ conn.cursor().execute("SET @fileset_time_last = %s", (int(time.time()),))
try:
with conn.cursor() as cursor:
@@ -1839,12 +2958,13 @@ def user_integrity_check(data, ip, game_metadata=None):
missing_set = set()
for fileset_id in matched_map.keys():
- cursor.execute(f"SELECT * FROM file WHERE fileset = {fileset_id}")
+ cursor.execute("SELECT * FROM file WHERE fileset = %s", (fileset_id,))
target_files = cursor.fetchall()
target_files_dict = {}
for target_file in target_files:
cursor.execute(
- f"SELECT * FROM filechecksum WHERE file = {target_file['id']}"
+ "SELECT * FROM filechecksum WHERE file = %s",
+ (target_file["id"],),
)
target_checksums = cursor.fetchall()
for checksum in target_checksums:
@@ -1924,12 +3044,13 @@ def user_integrity_check(data, ip, game_metadata=None):
most_matched = matched_list[0]
matched_fileset_id, matched_count = most_matched[0], most_matched[1]
cursor.execute(
- f"SELECT status FROM fileset WHERE id = {matched_fileset_id}"
+ "SELECT status FROM fileset WHERE id = %s", (matched_fileset_id,)
)
status = cursor.fetchone()["status"]
cursor.execute(
- f"SELECT COUNT(file.id) FROM file WHERE fileset = {matched_fileset_id}"
+ "SELECT COUNT(file.id) FROM file WHERE fileset = %s",
+ (matched_fileset_id,),
)
count = cursor.fetchone()["COUNT(file.id)"]
if status == "full" and count == matched_count:
@@ -1967,11 +3088,47 @@ def user_integrity_check(data, ip, game_metadata=None):
def add_usercount(fileset, conn):
with conn.cursor() as cursor:
cursor.execute(
- f"UPDATE fileset SET user_count = COALESCE(user_count, 0) + 1 WHERE id = {fileset}"
+ "UPDATE fileset SET user_count = COALESCE(user_count, 0) + 1 WHERE id = %s",
+ (fileset,),
)
- cursor.execute(f"SELECT user_count from fileset WHERE id = {fileset}")
+ cursor.execute("SELECT user_count from fileset WHERE id = %s", (fileset,))
count = cursor.fetchone()["user_count"]
if count >= 3:
cursor.execute(
- f"UPDATE fileset SET status = 'ReadyForReview' WHERE id = {fileset}"
+ "UPDATE fileset SET status = 'ReadyForReview' WHERE id = %s", (fileset,)
)
+
+
+def console_log(message):
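+    # Overwrite any in-progress status line (written with a trailing '\r') before printing the message.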
+ sys.stdout.write(" " * 50 + "\r")
+ sys.stdout.flush()
+ print(message)
+
+
+def console_log_candidate_filtering(fileset_count):
+ sys.stdout.write(f"Filtering Candidates - Fileset {fileset_count}\r")
+ sys.stdout.flush()
+
+
+def console_log_file_update(fileset_count):
+ sys.stdout.write(f"Updating files - Fileset {fileset_count}\r")
+ sys.stdout.flush()
+
+
+def console_log_matching(fileset_count):
+ sys.stdout.write(f"Performing Match - Fileset {fileset_count}\r")
+ sys.stdout.flush()
+
+
+def console_log_detection(fileset_count):
+ sys.stdout.write(f"Processing - Fileset {fileset_count}\r")
+ sys.stdout.flush()
+
+
+def console_log_total_filesets(file_path):
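+    # Count "game (" lines, which mark the start of each fileset entry in the .dat file.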
+ count = 0
+ with open(file_path, "r") as f:
+ for line in f:
+ if line.strip().startswith("game ("):
+ count += 1
+ print(f"Total filesets present - {count}.")
diff --git a/fileset.py b/fileset.py
index 9b9dc935..605d831f 100644
--- a/fileset.py
+++ b/fileset.py
@@ -15,7 +15,6 @@
)
from pagination import create_page
import difflib
-from pymysql.converters import escape_string
from db_functions import (
find_matching_filesets,
get_all_related_filesets,
@@ -23,28 +22,15 @@
user_integrity_check,
db_connect,
create_log,
+ db_connect_root,
)
from collections import defaultdict
+from schema import init_database
app = Flask(__name__)
secret_key = os.urandom(24)
-base_dir = os.path.dirname(os.path.abspath(__file__))
-config_path = os.path.join(base_dir, "mysql_config.json")
-with open(config_path) as f:
- mysql_cred = json.load(f)
-
-conn = pymysql.connect(
- host=mysql_cred["servername"],
- user=mysql_cred["username"],
- password=mysql_cred["password"],
- db=mysql_cred["dbname"],
- charset="utf8mb4",
- cursorclass=pymysql.cursors.DictCursor,
- autocommit=False,
-)
-
@app.route("/")
def index():
@@ -55,7 +41,12 @@ def index():
- Fileset Database
+
+
+
+
+
+ Fileset Database
Fileset Actions
Fileset
@@ -79,21 +70,13 @@ def index():
@app.route("/clear_database", methods=["POST"])
def clear_database():
try:
- conn = db_connect()
+ (conn, db_name) = db_connect_root()
with conn.cursor() as cursor:
- cursor.execute("SET FOREIGN_KEY_CHECKS = 0;")
- cursor.execute("TRUNCATE TABLE filechecksum")
- cursor.execute("TRUNCATE TABLE history")
- cursor.execute("TRUNCATE TABLE transactions")
- cursor.execute("TRUNCATE TABLE queue")
- cursor.execute("TRUNCATE TABLE file")
- cursor.execute("TRUNCATE TABLE fileset")
- cursor.execute("TRUNCATE TABLE game")
- cursor.execute("TRUNCATE TABLE engine")
- cursor.execute("TRUNCATE TABLE log")
- cursor.execute("SET FOREIGN_KEY_CHECKS = 1;")
+ cursor.execute("DROP DATABASE IF EXISTS %s", (db_name,))
conn.commit()
- print("DATABASE CLEARED")
+ print("DATABASE DROPPED")
+ init_database()
+ print("DATABASE INITIALISED")
except Exception as e:
print(f"Error clearing database: {e}")
finally:
@@ -139,15 +122,18 @@ def fileset():
id = max(min_id, min(id, max_id))
# Check if the id exists in the fileset table
- cursor.execute(f"SELECT id FROM fileset WHERE id = {id}")
+ cursor.execute("SELECT id FROM fileset WHERE id = %s", (id,))
if cursor.rowcount == 0:
# If the id doesn't exist, get a new id from the history table
- cursor.execute(f"SELECT fileset FROM history WHERE oldfileset = {id}")
+ cursor.execute(
+ "SELECT fileset FROM history WHERE oldfileset = %s", (id,)
+ )
id = cursor.fetchone()["fileset"]
# Get the history for the current id
cursor.execute(
- f"SELECT `timestamp`, oldfileset, log FROM history WHERE fileset = {id} ORDER BY `timestamp`"
+ "SELECT `timestamp`, oldfileset, log FROM history WHERE fileset = %s ORDER BY `timestamp`",
+ (id,),
)
history = cursor.fetchall()
@@ -159,7 +145,12 @@ def fileset():
- Fileset: {id}
+
+
+
+
+
+ Fileset: {id}
"""
html += f"Manual Merge "
@@ -176,18 +167,31 @@ def fileset():
(id,),
)
row = cursor.fetchone()
- print(row)
if row:
id = row["fileset"]
- cursor.execute(f"SELECT * FROM fileset WHERE id = {id}")
+ cursor.execute("SELECT status FROM fileset WHERE id = %s", (id,))
+ status = cursor.fetchone()["status"]
+
+ if status == "dat":
+ cursor.execute(
+ """SELECT id, game, status, src, `key`, megakey, `delete`, timestamp, set_dat_metadata FROM fileset WHERE id = %s""",
+ (id,),
+ )
+ else:
+ cursor.execute(
+ """SELECT id, game, status, src, `key`, megakey, `delete`, timestamp, detection_size, user_count FROM fileset WHERE id = %s""",
+ (id,),
+ )
+
result = cursor.fetchone()
- print(result)
html += "Fileset details "
html += "\n"
if result["game"]:
- cursor.execute(
- f"SELECT game.name as 'game name', engineid, gameid, extra, platform, language FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = {id}"
- )
+ if status == "dat":
+ query = """SELECT game.name as 'game name', engineid, gameid, extra, platform, language, fileset.set_dat_metadata FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = %s"""
+ else:
+ query = """SELECT game.name as 'game name', engineid, gameid, extra, platform, language FROM fileset JOIN game ON game.id = fileset.game JOIN engine ON engine.id = game.engine WHERE fileset.id = %s"""
+ cursor.execute(query, (id,))
result = {**result, **cursor.fetchone()}
else:
# result.pop('key', None)
@@ -237,6 +241,7 @@ def fileset():
"detection",
"detection_type",
"timestamp",
+ "modification-time",
]
if sort:
@@ -247,13 +252,11 @@ def fileset():
if "desc" in sort:
order += " DESC"
- columns_to_select = "file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp`"
+ columns_to_select = "file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp`, `modification-time`"
columns_to_select += ", ".join(md5_columns)
- print(
- f"SELECT file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp` FROM file WHERE fileset = {id} {order}"
- )
cursor.execute(
- f"SELECT file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp` FROM file WHERE fileset = {id} {order}"
+ "SELECT file.id, name, size, `size-r`, `size-rd`, checksum, detection, detection_type, `timestamp`, `modification-time` FROM file WHERE fileset = %s %s",
+ (id, order),
)
result = cursor.fetchall()
@@ -263,7 +266,8 @@ def fileset():
if widetable == "full":
file_ids = [file["id"] for file in result]
cursor.execute(
- f"SELECT file, checksum, checksize, checktype FROM filechecksum WHERE file IN ({','.join(map(str, file_ids))})"
+ "SELECT file, checksum, checksize, checktype FROM filechecksum WHERE file IN (%s)",
+ (",".join(map(str, file_ids)),),
)
checksums = cursor.fetchall()
@@ -330,7 +334,8 @@ def fileset():
if "delete" in request.form:
cursor.execute(
- f"UPDATE fileset SET `delete` = TRUE WHERE id = {request.form['delete']}"
+ "UPDATE fileset SET `delete` = TRUE WHERE id = %s",
+ (request.form["delete"],),
)
connection.commit()
html += "Fileset marked for deletion
"
@@ -341,7 +346,8 @@ def fileset():
# Generate the HTML for the fileset history
cursor.execute(
- f"SELECT `timestamp`, category, `text`, id FROM log WHERE `text` REGEXP 'Fileset:{id}' ORDER BY `timestamp` DESC, id DESC"
+ "SELECT `timestamp`, category, `text`, id FROM log WHERE `text` REGEXP 'Fileset:%s' ORDER BY `timestamp` DESC, id DESC",
+ (id,),
)
# cursor.execute(f"SELECT `timestamp`, fileset, oldfileset FROM history WHERE fileset = {id} ORDER BY `timestamp` DESC")
@@ -354,17 +360,22 @@ def fileset():
html += "Description \n"
html += "Log Text \n"
- related_filesets = get_all_related_filesets(id, conn)
+ related_filesets = get_all_related_filesets(id, connection)
cursor.execute(
- f"SELECT * FROM history WHERE fileset IN ({','.join(map(str, related_filesets))}) OR oldfileset IN ({','.join(map(str, related_filesets))})"
+ "SELECT * FROM history WHERE fileset IN (%s) OR oldfileset IN (%s)",
+ (
+ ",".join(map(str, related_filesets)),
+ ",".join(map(str, related_filesets)),
+ ),
)
history = cursor.fetchall()
print(f"History: {history}")
for h in history:
cursor.execute(
- f"SELECT `timestamp`, category, `text`, id FROM log WHERE `text` LIKE 'Fileset:{h['oldfileset']}' ORDER BY `timestamp` DESC, id DESC"
+ "SELECT `timestamp`, category, `text`, id FROM log WHERE `text` LIKE 'Fileset:%s' ORDER BY `timestamp` DESC, id DESC",
+ (h["oldfileset"],),
)
logs = cursor.fetchall()
print(f"Logs: {logs}")
@@ -378,7 +389,9 @@ def fileset():
html += f"Created fileset Fileset {h['fileset']} \n"
# html += f"Log {h['log']} \n"
if h["log"]:
- cursor.execute(f"SELECT `text` FROM log WHERE id = {h['log']}")
+ cursor.execute(
+ "SELECT `text` FROM log WHERE id = %s", (h["log"],)
+ )
log_text = cursor.fetchone()["text"]
log_text = convert_log_text_to_links(log_text)
html += f"Log {h['log']} : {log_text} \n"
@@ -393,7 +406,7 @@ def fileset():
html += f"Fileset {h['oldfileset']} merged into fileset Fileset {h['fileset']} \n"
# html += f"Log {h['log']} \n"
if h["log"]:
- cursor.execute(f"SELECT `text` FROM log WHERE id = {h['log']}")
+ cursor.execute("SELECT `text` FROM log WHERE id = %s", (h["log"],))
log_text = cursor.fetchone()["text"]
log_text = convert_log_text_to_links(log_text)
html += f"Log {h['log']} : {log_text} \n"
@@ -425,21 +438,23 @@ def match_fileset_route(id):
try:
with connection.cursor() as cursor:
- cursor.execute(f"SELECT * FROM fileset WHERE id = {id}")
+ cursor.execute("SELECT * FROM fileset WHERE id = %s", (id,))
fileset = cursor.fetchone()
fileset["rom"] = []
if not fileset:
return f"No fileset found with id {id}", 404
cursor.execute(
- f"SELECT file.id, name, size, checksum, detection, detection_type FROM file WHERE fileset = {id}"
+ "SELECT file.id, name, size, checksum, detection, detection_type FROM file WHERE fileset = %s",
+ (id,),
)
result = cursor.fetchall()
file_ids = {}
for file in result:
file_ids[file["id"]] = (file["name"], file["size"])
cursor.execute(
- f"SELECT file, checksum, checksize, checktype FROM filechecksum WHERE file IN ({','.join(map(str, file_ids.keys()))})"
+ "SELECT file, checksum, checksize, checktype FROM filechecksum WHERE file IN (%s)",
+ (",".join(map(str, file_ids.keys())),),
)
files = cursor.fetchall()
@@ -473,7 +488,12 @@ def match_fileset_route(id):
- Matched Filesets for Fileset: {id}
+
+
+
+
+
+ Matched Filesets for Fileset: {id}
Fileset ID
@@ -486,7 +506,7 @@ def match_fileset_route(id):
if fileset_id == id:
continue
cursor.execute(
- f"SELECT COUNT(file.id) FROM file WHERE fileset = {fileset_id}"
+ "SELECT COUNT(file.id) FROM file WHERE fileset = %s", (fileset_id,)
)
count = cursor.fetchone()["COUNT(file.id)"]
html += f"""
@@ -560,7 +580,12 @@ def merge_fileset(id):
- Search Results for '{search_query}'
+
+
+
+
+
+ Search Results for '{search_query}'