@@ -1059,6 +1059,7 @@ def scan_process(
1059
1059
create_log (escape_string (category_text ), user , escape_string (log_text ), conn )
1060
1060
category_text = "Upload information"
1061
1061
log_text = f"Number of filesets: { fileset_insertion_count } . Filesets automatically merged: { automatic_merged_filesets } . Filesets requiring manual merge (multiple candidates): { manual_merged_filesets } . Filesets requiring manual merge (matched with detection): { manual_merged_with_detection } . Filesets dropped, no candidate: { dropped_early_no_candidate } . Filesets matched with existing Full fileset: { match_with_full_fileset } . Filesets with mismatched files with Full fileset: { mismatch_with_full_fileset } . Filesets missing files compared to partial fileset candidate: { filesets_with_missing_files } ."
1062
+ console_log (log_text )
1062
1063
create_log (escape_string (category_text ), user , escape_string (log_text ), conn )
1063
1064
1064
1065
@@ -1115,9 +1116,12 @@ def pre_update_files(rom, filesets_check_for_full, transaction_id, conn):
1115
1116
SET size = %s,
1116
1117
`size-r` = %s,
1117
1118
`size-rd` = %s,
1119
+ name = %s
1118
1120
WHERE id = %s
1119
1121
"""
1120
- cursor .execute (query , size , size_r , size_rd , file_id )
1122
+ cursor .execute (
1123
+ query , (size , size_r , size_rd , normalised_path (rom ["name" ]), file_id )
1124
+ )
1121
1125
1122
1126
1123
1127
def scan_perform_match (
@@ -1396,73 +1400,146 @@ def total_fileset_files(fileset):
1396
1400
1397
1401
def scan_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn):
    """
    Returns a list of candidate filesets that the scan fileset can be merged with.

    The SQL query only fetches the detection files (name, size, size-r,
    size-rd) of every fileset that does not belong to the current
    transaction. All filtering (by name, sizes, checksum and maximum match
    count) is then done in Python.

    Args:
        fileset_id: Not read by this implementation; kept so the signature
            stays compatible with existing callers.
        fileset: Scan fileset; its "rom" entry is iterated, and each entry
            must provide "name", "size", "size-r", "size-rd" plus any
            number of keys starting with "md5".
        transaction_id: Filesets attached to this transaction are excluded.
        conn: Database connection whose cursors return rows addressable
            by column name.

    Returns:
        List of fileset ids. Only filesets whose every detection file
        matched are considered, and of those only the ones with the highest
        match count are kept. If at least one of them also passes
        is_full_detection_checksum_match, only the passing ones are
        returned; otherwise the whole max-count list is returned.
    """
    # Fetching detection filename and all sizes (size, size-r, size-rd) from database
    query = """
        SELECT fs.id AS fileset_id, f.id as file_id, f.name, f.size,
        f.`size-r` AS size_r, f.`size-rd` AS size_rd
        FROM file f
        JOIN fileset fs ON f.fileset = fs.id
        JOIN game g ON g.id = fs.game
        JOIN engine e ON e.id = g.engine
        JOIN transactions t ON t.fileset = fs.id
        WHERE f.detection = 1
        AND t.transaction != %s
    """
    checksum_query = """
        SELECT checksum, checksize, checktype
        FROM filechecksum
        WHERE file = %s
    """

    # A single cursor serves both the initial fetch and the per-file checksum
    # lookups; the first result set is fully materialised before it is reused.
    with conn.cursor() as cursor:
        cursor.execute(query, (transaction_id,))
        raw_candidates = cursor.fetchall()

        # fileset id to detection files map
        candidate_map = defaultdict(list)
        for row in raw_candidates:
            candidate_map[row["fileset_id"]].append(
                {
                    "file_id": row["file_id"],
                    # Compare on lower-cased base names only.
                    "name": os.path.basename(normalised_path(row["name"])).lower(),
                    "size": row["size"],
                    "size-r": row["size_r"],
                    "size-rd": row["size_rd"],
                }
            )

        # Lookup sets built from the scan fileset. Every file is registered
        # both with its real size and with size -1.
        set_checksums = set()
        set_file_name_size = set()
        for rom_file in fileset["rom"]:
            name = os.path.basename(normalised_path(rom_file["name"])).lower()
            size = int(rom_file["size"])
            size_r = int(rom_file["size-r"])
            size_rd = int(rom_file["size-rd"])
            for key in rom_file:
                if key.startswith("md5"):
                    set_checksums.add((rom_file[key], name, size, size_r, size_rd))
                    set_checksums.add((rom_file[key], name, -1, size_r, size_rd))
            set_file_name_size.add((name, -1, size_r, size_rd))
            set_file_name_size.add((name, size, size_r, size_rd))

        # Filter candidates by detection filename and file size (including -1)
        # and increase matched file count
        #   if filesize = -1,
        #   elif filesize <= checksize and checksum matches,
        #   elif filesize > checksize.
        match_counts = {}
        for candidate_id, files in candidate_map.items():
            count = 0
            for f in files:
                filename = f["name"]  # already a lower-cased base name
                size = f["size"]
                size_r = f["size-r"]
                size_rd = f["size-rd"]

                if (filename, size, size_r, size_rd) not in set_file_name_size:
                    continue
                if size == -1:
                    count += 1
                    continue

                cursor.execute(checksum_query, (f["file_id"],))
                checksums = cursor.fetchall()

                # True once a stored checksum covers the whole file, i.e. the
                # match can actually be verified against the scan checksums.
                verifiable = False
                for c in checksums:
                    filesize = size
                    checksum = c["checksum"]
                    checksize = c["checksize"]
                    checktype = c["checktype"]
                    # Macfiles handling
                    if checktype in ["md5-r", "md5-rt"]:
                        filesize = size_rd

                    if checksize == "1M":
                        checksize = 1048576
                    elif checksize == "0":
                        checksize = filesize

                    if filesize <= int(checksize):
                        if (checksum, filename, size, size_r, size_rd) in set_checksums:
                            count += 1
                        verifiable = True
                        # if it was a true match, checksum should be present
                        break

                # File is larger than every checksize: name/size match is
                # all that can be checked here, so it counts as matched.
                if not verifiable:
                    count += 1

            # Keep the candidate only if all of its detection files matched.
            if count > 0 and len(files) <= count:
                match_counts[candidate_id] = count

    # Filter only entries with maximum number of matched files
    if not match_counts:
        return []

    max_match = max(match_counts.values())
    candidates = [fid for fid, count in match_counts.items() if count == max_match]

    # Prefer candidates whose detection checksums fully match; fall back to
    # the unfiltered max-count list when none of them do.
    matched_candidates = [
        candidate
        for candidate in candidates
        if is_full_detection_checksum_match(candidate, fileset, conn)
    ]
    return matched_candidates or candidates
1466
1543
1467
1544
1468
1545
def get_unmatched_files (candidate_fileset , fileset , conn ):
0 commit comments