Commit c02689f

update 2024 analysis workflows
1 parent 333e022 commit c02689f

File tree

2 files changed: +166 -1089 lines changed


markdown/2024_insights/03_analysis.md

Lines changed: 64 additions & 75 deletions
@@ -13,6 +13,7 @@ jupyter:
 ---
 
 ```python
+
 import json
 from datetime import datetime
 
@@ -42,6 +43,7 @@ Key insights calculated:
 
 
 ```python
+# Calculate total vulnerabilities for each year
 total_vulns_2023 = df_2023.shape[0]
 total_vulns_2024 = df_2024.shape[0]
 percentage_change = round(((total_vulns_2024 - total_vulns_2023) / total_vulns_2023) * 100, 3)
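For intuition, the percentage-change formula in this hunk behaves like this on made-up totals (not the real dataset counts):

```python
# Hypothetical year totals, for illustration only
total_2023 = 28000
total_2024 = 33600

# Same formula as above: relative change as a percentage, rounded to 3 places
print(round(((total_2024 - total_2023) / total_2023) * 100, 3))  # 20.0
```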
@@ -52,11 +54,10 @@ total_vulnerabilities = {
     "percentage_change": percentage_change
 }
 
-# 2023 Data
+# Calculate month with the most vulnerabilities for each year
 month_2023 = df_2023['Published_Month'].value_counts().idxmax()
 count_2023 = df_2023['Published_Month'].value_counts().max()
 
-# 2024 Data
 month_2024 = df_2024['Published_Month'].value_counts().idxmax()
 count_2024 = df_2024['Published_Month'].value_counts().max()
 
@@ -75,11 +76,13 @@ month_with_most_vulnerabilities = {
     }
 }
 
+# Generate severity distribution for each year
 severity_distribution = {
     "2023": df_2023['CVSS_Severity'].value_counts().to_dict(),
     "2024": df_2024['CVSS_Severity'].value_counts().to_dict()
 }
 
+# Final JSON structure
 overview_metrics = {
     "metadata": {
         "description": "Overview metrics summarizing vulnerability data for 2023 and 2024.",
@@ -98,11 +101,10 @@ overview_metrics = {
     }
 }
 
-# Save the overview_metrics to a JSON file
-with open("../../data/2024_insights/output/overview_metrics.json", "w") as f:
+# Save the overview metrics to a JSON file
+output_path = "../../data/2024_insights/output/overview_metrics.json"
+with open(output_path, "w") as f:
     json.dump(overview_metrics, f)
-
-overview_metrics
 ```
 
 ## Time-Series Metrics
@@ -185,8 +187,6 @@ time_series_metrics = {
 # Save the time_series_metrics to a JSON file
 with open("../../data/2024_insights/output/time_series_metrics.json", "w") as f:
     json.dump(time_series_metrics, f)
-
-time_series_metrics
 ```
 
 ## Vendor/Product Analysis
@@ -344,8 +344,6 @@ vendor_product = {
 # Save the vendor_product metrics to a JSON file
 with open("../../data/2024_insights/output/vendor_product_analysis.json", "w") as f:
     json.dump(vendor_product, f)
-
-vendor_product
 ```
 
 ## CISA KEV Analysis
@@ -363,10 +361,23 @@ The Known Exploited Vulnerabilities (KEV) catalog provides critical insights for
 kev_records_2023 = df_2023[df_2023['CISA_KEV'] == True].copy()
 kev_records_2024 = df_2024[df_2024['CISA_KEV'] == True].copy()
 
-# Ensure datetime columns are in the correct format and remove timezone info
-for df in [kev_records_2023, kev_records_2024]:
-    df['KEV_DateAdded'] = pd.to_datetime(df['KEV_DateAdded'], errors='coerce').dt.tz_localize(None)
-    df['Published_Date'] = pd.to_datetime(df['Published_Date'], errors='coerce').dt.tz_localize(None)
+
+# Function to ensure proper datetime conversion
+def ensure_datetime_conversion(df, column_name):
+    df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
+    if not pd.api.types.is_datetime64_any_dtype(df[column_name]):
+        raise ValueError(f"Column {column_name} could not be converted to datetime64[ns]!")
+
+
+# Ensure datetime for all relevant columns
+for df in [df_2023, df_2024, kev_records_2023, kev_records_2024]:
+    ensure_datetime_conversion(df, 'Published_Date')
+    ensure_datetime_conversion(df, 'KEV_DateAdded')
+
+# Remove timezone info
+for df in [df_2023, df_2024, kev_records_2023, kev_records_2024]:
+    df['Published_Date'] = df['Published_Date'].dt.tz_localize(None)
+    df['KEV_DateAdded'] = df['KEV_DateAdded'].dt.tz_localize(None)
 
 # Group CISA KEV Data by Month for 2023 and 2024
 kev_additions = {
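The new helper leans on `errors='coerce'`; a minimal sketch (invented values) of what that buys:

```python
import pandas as pd

# Invented values: coercion turns unparseable strings into NaT instead of raising
s = pd.to_datetime(pd.Series(['2024-01-15', 'not-a-date']), errors='coerce')
print(s.isna().sum())                           # 1 -> the bad value became NaT
print(pd.api.types.is_datetime64_any_dtype(s))  # True -> passes the helper's dtype check
```

Since coercion normally yields a `datetime64` column even when individual values fail to parse, the `ValueError` guard fires only in unusual cases; a stricter check could instead assert that the NaT count did not grow.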
@@ -391,7 +402,7 @@ kev_monthly_changes = {
 
 # Calculate NVD-KEV Overlap Percentage
 nvd_cve_counts = {
-    year: df.groupby('Published_Month').size().reindex(range(1, 13), fill_value=0)
+    year: df.groupby(df['Published_Date'].dt.month).size().reindex(range(1, 13), fill_value=0)
     for year, df in {"2023": df_2023, "2024": df_2024}.items()
 }
 
@@ -404,53 +415,33 @@ kev_overlap = {
     for year in ["2023", "2024"]
 }
 
-# Top Vendors in KEV Catalog for 2023 and 2024
+# Top Vendors in KEV Catalog
 top_kev_vendors = {
     year: (
-        records[records['KEV_DateAdded'].dt.year == int(year)]
-        .groupby('KEV_Vendor')
-        .size()
-        .sort_values(ascending=False)
-        .head(10)
-        .reset_index(name='kev_count')
-        .to_dict(orient='records')
+        records.groupby('KEV_Vendor').size()
+        .sort_values(ascending=False).head(10)
+        .reset_index(name='kev_count').to_dict(orient='records')
     )
     for year, records in {"2023": kev_records_2023, "2024": kev_records_2024}.items()
 }
 
-# Vendor Ranking Changes (Prioritize 2024 top vendors and compare with 2023)
+# Vendor Ranking Changes
 vendor_rank_changes = []
-for vendor in {v['KEV_Vendor'] for v in top_kev_vendors['2024']}:
+all_vendors = {v['KEV_Vendor'] for v in top_kev_vendors['2024']}
+for vendor in all_vendors:
     rank_2023 = next((i + 1 for i, v in enumerate(top_kev_vendors['2023']) if v['KEV_Vendor'] == vendor), None)
-    rank_2024 = next((i + 1 for i, v in enumerate(top_kev_vendors['2024']) if v['KEV_Vendor'] == vendor), None)
-    vendor_rank_changes.append({
-        "vendor": vendor,
-        "2023_rank": rank_2023,
-        "2024_rank": rank_2024
-    })
-
-# Sort the vendor_rank_changes by 2024 rank
-vendor_rank_changes = sorted(vendor_rank_changes, key=lambda x: x["2024_rank"] if x["2024_rank"] else float('inf'))
-
-# Time to KEV Inclusion Metrics for 2023 and 2024
-time_to_kev_inclusion = {}
-for year, records in {"2023": kev_records_2023, "2024": kev_records_2024}.items():
-    inclusion_times = records[records['KEV_DateAdded'].dt.year == int(year)].copy()
-    inclusion_times['Time_To_KEV'] = (
-        inclusion_times['KEV_DateAdded'] - inclusion_times['Published_Date']
-    ).dt.days
-    inclusion_times = inclusion_times[inclusion_times['Time_To_KEV'] >= 0]
-    time_to_kev_inclusion[year] = {
-        "min_days": int(inclusion_times['Time_To_KEV'].min()) if not inclusion_times.empty else None,
-        "max_days": int(inclusion_times['Time_To_KEV'].max()) if not inclusion_times.empty else None,
-        "average_days": round(float(inclusion_times['Time_To_KEV'].mean()), 2) if not inclusion_times.empty else None
-    }
-
-# Final JSON Structure
+    rank_2024 = next((i + 1 for i, v in enumerate(top_kev_vendors['2024']) if v['KEV_Vendor'] == vendor), None)
+    vendor_rank_changes.append({
+        "vendor": vendor,
+        "2023_rank": rank_2023 or "Not Ranked",
+        "2024_rank": rank_2024 or "Not Ranked"
+    })
+
+# Prepare JSON Data
 cisa_kev = {
     "metadata": {
         "description": "Analysis of CISA KEV catalog inclusion and overlap with NVD vulnerabilities for 2023 and 2024.",
-        "generated_on": generated_date,
+        "generated_on": "2025-01-04", # Example date, replace with actual
         "source": ["NVD", "CISA KEV"],
         "attribution": {
             "NVD": "This product uses the NVD API but is not endorsed or certified by the NVD.",
@@ -466,16 +457,14 @@ cisa_kev = {
             "note": "Percentage of NVD vulnerabilities in a month that were also added to KEV."
         },
         "top_kev_vendors": top_kev_vendors,
-        "vendor_rank_changes": vendor_rank_changes,
-        "time_to_kev_inclusion": time_to_kev_inclusion
+        "vendor_rank_changes": vendor_rank_changes
     }
 }
 
-# Save the cisa_kev metrics to a JSON file
-with open("../../data/2024_insights/output/cisa_kev_analysis.json", "w") as f:
+# Save JSON to File
+output_path = "../../data/2024_insights/output/cisa_kev_analysis.json"
+with open(output_path, "w") as f:
     json.dump(cisa_kev, f)
-
-cisa_kev
 ```
 
 ## Specific CVE Details
@@ -486,10 +475,16 @@ This section highlights vulnerabilities with high impact or severity to assist i
 2. **Most Impactful Vulnerabilities**: Combines multiple factors (CVSS, KEV inclusion, exploitation evidence) to rank vulnerabilities.
 
 ```python
+import pandas as pd
+import json
+
+# Set the generated date for metadata
+generated_date = "2024-12-30"
+
 # Filter for all CVEs with CVSS_Base_Score of 10.0
-cvss_10_cves = df_2024[df_2024['CVSS_Base_Score'] == 10.0]
+cvss_10_cves = df_2024[df_2024['CVSS_Base_Score'] == 10.0].copy()
 
-# Additional CVEs to include if fewer than 25
+# Determine if additional CVEs are needed to make up a total of 25
 remaining_cves_needed = 25 - len(cvss_10_cves)
 if remaining_cves_needed > 0:
     additional_cves = (
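The `additional_cves` expression is truncated by the hunk boundary; a minimal sketch of the top-up idea, with invented CVEs, a toy target of 2 instead of 25, and `nlargest` standing in for whatever selection the notebook actually uses:

```python
import pandas as pd

# Invented rows; the notebook's real selection logic may differ
df = pd.DataFrame({'CVE_ID': ['CVE-A', 'CVE-B', 'CVE-C'],
                   'CVSS_Base_Score': [10.0, 9.8, 9.1]})
cvss_10 = df[df['CVSS_Base_Score'] == 10.0].copy()
needed = 2 - len(cvss_10)  # toy target of 2 rather than 25
extra = df[df['CVSS_Base_Score'] < 10.0].nlargest(needed, 'CVSS_Base_Score')
print(pd.concat([cvss_10, extra])['CVE_ID'].tolist())  # ['CVE-A', 'CVE-B']
```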
@@ -503,40 +498,38 @@ if remaining_cves_needed > 0:
 else:
     most_severe = cvss_10_cves
 
-# Ensure the final result is sorted and unique
+# Sort and format most severe CVEs
 most_severe = (
     most_severe.sort_values(by=['CVSS_Base_Score', 'CVE_ID'], ascending=[False, True])
     [['CVE_ID', 'Description', 'CVSS_Base_Score', 'Vendor', 'Product']]
     .drop_duplicates()
     .to_dict(orient='records')
 )
 
-# Add exploitation evidence if not already present
+# Add exploitation evidence to the dataset
 df_2024['Exploitation_Evidence'] = df_2024['CVE_ID'].isin(kev_records_2024['CVE_ID'])
 
 
-# Define Impact Score calculation function
-
-
+# Define a function to calculate the Impact Score
 def calculate_impact_score(row):
     exploitation_weight = 10 if row['Exploitation_Evidence'] else 0
     impact_score = row['CVSS_Base_Score'] * 2 + exploitation_weight
     return round(impact_score, 2)
 
 
-# Calculate Impact Scores
+# Calculate Impact Scores for all CVEs
 df_2024['Impact_Score'] = df_2024.apply(calculate_impact_score, axis=1)
 
-# Filter for most impactful CVEs
+# Filter for the most impactful CVEs
 most_impactful = (
     df_2024.sort_values(by=['Impact_Score', 'CVE_ID'], ascending=[False, True])
     [['CVE_ID', 'Impact_Score', 'Exploitation_Evidence', 'Vendor', 'Product']]
     .drop_duplicates()
-    .head(10) # Top 10 CVEs
+    .head(10) # Select the top 10 CVEs
     .to_dict(orient='records')
 )
 
-# Final JSON structure
+# Construct the final JSON structure
 specific_cve_details = {
     "metadata": {
         "description": "Detailed analysis of the most severe and impactful CVEs for 2024.",
@@ -558,11 +551,10 @@ specific_cve_details = {
     }
 }
 
-# Save the specific_cve_details metrics to a JSON file
-with open("../../data/2024_insights/output/cve_details.json", "w") as f:
+# Save the specific CVE details to a JSON file
+output_path = "../../data/2024_insights/output/cve_details.json"
+with open(output_path, "w") as f:
     json.dump(specific_cve_details, f)
-
-specific_cve_details
 ```
 
 ## CVE Assigner Analysis
@@ -575,7 +567,6 @@ This section identifies the organizations assigning the most CVEs to understand
 ```python
 # Group by CVE Assigner for 2023 and 2024 with severity breakdown and total counts
 
-
 def get_top_assigners_with_totals(df, year):
     assigner_data = (
         df.groupby('CVE_Assigner')
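The body of `get_top_assigners_with_totals` is cut off at the hunk boundary; a minimal sketch of a severity-breakdown-with-totals grouping (invented assigners and values, not necessarily the notebook's exact aggregation):

```python
import pandas as pd

# Invented data; the real function's aggregation may differ
df = pd.DataFrame({'CVE_Assigner': ['cna@a.example', 'cna@a.example', 'cna@b.example'],
                   'CVSS_Severity': ['HIGH', 'CRITICAL', 'HIGH']})
breakdown = df.groupby(['CVE_Assigner', 'CVSS_Severity']).size().unstack(fill_value=0)
breakdown['total'] = breakdown.sum(axis=1)
print(breakdown.sort_values('total', ascending=False))
```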
@@ -651,6 +642,4 @@ top_assigners = {
 # Save the top_assigners metrics to a JSON file
 with open("../../data/2024_insights/output/top_assigners.json", "w") as f:
     json.dump(top_assigners, f)
-
-top_assigners
 ```
