Skip to content

Commit 365b4f2

Browse files
ShivangNagtasev-
authored and committed
INTEGRITY: Filter files by filename instead of entire path, as detections do not necessarily store the entire filepath.
1 parent 22d9131 commit 365b4f2

File tree

1 file changed

+31
-10
lines changed

1 file changed

+31
-10
lines changed

db_functions.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,14 +1240,18 @@ def set_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn):
12401240
matched_detection_files AS (
12411241
SELECT cf.fileset_id, COUNT(*) AS match_files_count
12421242
FROM candidate_fileset cf
1243-
JOIN set_fileset sf ON cf.name = sf.name AND (cf.size = sf.size OR cf.size = -1)
1243+
JOIN set_fileset sf ON ( (
1244+
cf.name = sf.name
1245+
OR
1246+
REGEXP_REPLACE(cf.name, '^.*[\\\\/]', '') = REGEXP_REPLACE(sf.name, '^.*[\\\\/]', '')
1247+
) AND (cf.size = sf.size OR cf.size = -1) )
12441248
GROUP BY cf.fileset_id
12451249
),
12461250
valid_matched_detection_files AS (
12471251
SELECT mdf.fileset_id, mdf.match_files_count AS valid_match_files_count
12481252
FROM matched_detection_files mdf
12491253
JOIN total_detection_files tdf ON tdf.fileset_id = mdf.fileset_id
1250-
WHERE tdf.detection_files_found = mdf.match_files_count
1254+
WHERE tdf.detection_files_found <= mdf.match_files_count
12511255
),
12521256
max_match_count AS (
12531257
SELECT MAX(valid_match_files_count) AS max_count FROM valid_matched_detection_files
@@ -1256,7 +1260,6 @@ def set_filter_candidate_filesets(fileset_id, fileset, transaction_id, conn):
12561260
FROM valid_matched_detection_files vmdf
12571261
JOIN total_detection_files tdf ON vmdf.fileset_id = tdf.fileset_id
12581262
JOIN max_match_count mmc ON vmdf.valid_match_files_count = mmc.max_count
1259-
WHERE vmdf.valid_match_files_count = tdf.detection_files_found;
12601263
"""
12611264

12621265
cursor.execute(
@@ -1619,13 +1622,16 @@ def populate_file(fileset, fileset_id, conn, detection):
16191622

16201623
def set_populate_file(fileset, fileset_id, conn, detection):
16211624
"""
1622-
TODO
1625+
Updates the old fileset in case of a match. Further deletes the newly created fileset which is not needed anymore.
16231626
"""
16241627
with conn.cursor() as cursor:
1625-
cursor.execute(f"SELECT id, name FROM file WHERE fileset = {fileset_id}")
1628+
# Extracting the filename from the filepath.
1629+
cursor.execute(
1630+
f"SELECT id, REGEXP_REPLACE(name, '^.*[\\\\/]', '') AS name, size FROM file WHERE fileset = {fileset_id}"
1631+
)
16261632
target_files = cursor.fetchall()
16271633
candidate_files = {
1628-
target_file["name"].lower(): target_file["id"]
1634+
target_file["name"].lower(): [target_file["id"], target_file["size"]]
16291635
for target_file in target_files
16301636
}
16311637

@@ -1634,7 +1640,15 @@ def set_populate_file(fileset, fileset_id, conn, detection):
16341640
continue
16351641
checksize, checktype, checksum = get_checksum_props("md5", file["md5"])
16361642

1637-
if file["name"].lower() not in candidate_files:
1643+
filename = os.path.basename(normalised_path(file["name"]))
1644+
1645+
if filename.lower() not in candidate_files or (
1646+
filename.lower() in candidate_files
1647+
and (
1648+
candidate_files[filename.lower()][1] != -1
1649+
and candidate_files[filename.lower()][1] != file["size"]
1650+
)
1651+
):
16381652
name = normalised_path(file["name"])
16391653
values = [name]
16401654

@@ -1658,11 +1672,18 @@ def set_populate_file(fileset, fileset_id, conn, detection):
16581672
else:
16591673
query = """
16601674
UPDATE file
1661-
SET size = %s
1675+
SET size = %s,
1676+
name = %s
16621677
WHERE id = %s
16631678
"""
1679+
# Filtering was by filename, but we are still updating the file with the original filepath.
16641680
cursor.execute(
1665-
query, (file["size"], candidate_files[file["name"].lower()])
1681+
query,
1682+
(
1683+
file["size"],
1684+
normalised_path(file["name"]),
1685+
candidate_files[filename.lower()][0],
1686+
),
16661687
)
16671688
query = """
16681689
INSERT INTO filechecksum (file, checksize, checktype, checksum)
@@ -1671,7 +1692,7 @@ def set_populate_file(fileset, fileset_id, conn, detection):
16711692
cursor.execute(
16721693
query,
16731694
(
1674-
candidate_files[file["name"].lower()],
1695+
candidate_files[filename.lower()][0],
16751696
checksize,
16761697
checktype,
16771698
checksum,

0 commit comments

Comments
 (0)