Skip to content

Commit 71e1f86

Browse files
committed
feat: Analyze CWE rank trends (1999–2024)
- Added detailed analysis of the top 10 CWEs by cumulative rank over time. - Visualized CWE rank trends with line plots, markers, and year highlights. - Extracted key insights on rising, falling, and persistent CWEs. - Refined dataset filtering for accuracy, excluding non-informative CWEs.
1 parent 6492afe commit 71e1f86

File tree

5 files changed

+794
-9
lines changed

5 files changed

+794
-9
lines changed
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
---
2+
jupyter:
3+
jupytext:
4+
text_representation:
5+
extension: .md
6+
format_name: markdown
7+
format_version: '1.3'
8+
jupytext_version: 1.16.6
9+
kernelspec:
10+
display_name: Python 3
11+
language: python
12+
name: python3
13+
---
14+
15+
# CVE Data Stories: CWE Trends - Data Processing
16+
17+
18+
```python
19+
import csv
20+
import json
21+
from collections import defaultdict
22+
from datetime import datetime
23+
from pathlib import Path
24+
25+
import pandas as pd
26+
```
27+
28+
# Paths Setup and Data Directories
29+
30+
We start by defining the paths for the raw CVE datasets and setting up the target directory for storing processed data. This includes creating a dictionary of dataset file names for each year and ensuring the target directory exists for saving outputs.
31+
32+
```python
33+
# Raw NVD 1.1 feed file names, one per year (feeds exist for 2002-2024).
DATASETS = {feed_year: "nvdcve-1.1-{}.json".format(feed_year) for feed_year in range(2002, 2025)}
data_folder = Path("../../../data/cve_data_stories/raw")

# Processed-data directory; created eagerly so later writes cannot fail on a
# missing path.
DATA_DIR = Path("../../../data/cve_data_stories/cwe_trends/processed")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Output files: per-year counts, and per-year counts with running totals.
output_csv_yearly = DATA_DIR / "cwe_yearly_counts.csv"
output_csv_cumulative = DATA_DIR / "cwe_yearly_cumulative.csv"
43+
```
44+
45+
# Collecting CWE Yearly Counts
46+
47+
This section processes the raw JSON datasets to extract CWE IDs and their associated publication years.
48+
49+
The key steps include:
50+
1. Reading the JSON files.
51+
2. Extracting CWE IDs and publication years from each CVE item.
52+
3. Counting occurrences of each CWE ID by year.
53+
54+
The resulting yearly counts are stored in a dictionary for further processing.
55+
56+
```python
57+
def collect_cwe_yearly_counts(json_file, year_counts):
    """Tally CWE occurrences per publication year from one NVD 1.1 JSON feed.

    Args:
        json_file: Path to an ``nvdcve-1.1-<year>.json`` feed file.
        year_counts: Mutable mapping of ``(cwe_id, year) -> count`` (e.g. a
            ``defaultdict(int)``), updated in place.

    File-level problems (missing file, undecodable JSON) are reported and the
    function returns; a malformed date on a single CVE item skips only that
    item, instead of aborting the rest of the file as the previous broad
    ``except Exception`` did.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"File not found: {json_file}")
        return
    except json.JSONDecodeError:
        print(f"Error decoding JSON: {json_file}")
        return

    for item in data.get('CVE_Items', []):
        published_date = item.get('publishedDate')
        if not published_date:
            continue  # Skip if no published date

        # NVD 1.1 feeds format dates like "2024-01-05T17:15Z".  One bad date
        # should not discard every remaining item in the file.
        try:
            pub_year = datetime.strptime(published_date, "%Y-%m-%dT%H:%MZ").year
        except ValueError:
            continue

        # Walk problemtype entries to pull out CWE IDs (e.g. "CWE-79").
        cwe_entries = item.get('cve', {}).get('problemtype', {}).get('problemtype_data', [])
        for cwe_entry in cwe_entries:
            for desc in cwe_entry.get('description', []):
                cwe = desc.get('value', '')  # Get CWE ID (e.g., CWE-79)
                if cwe:
                    year_counts[(cwe, pub_year)] += 1
85+
86+
87+
# Accumulator mapping (CWE_ID, year) -> number of CVEs tagged that year.
cwe_yearly_counts = defaultdict(int)

# Feed every yearly dataset through the collector.
for feed_year, feed_name in DATASETS.items():
    feed_path = data_folder / feed_name
    print(f"Processing {feed_path}")
    collect_cwe_yearly_counts(feed_path, cwe_yearly_counts)

# Persist the counts, sorted by (CWE_ID, year) for stable, diff-friendly output.
with open(output_csv_yearly, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["CWE_ID", "Year", "Count"])  # Header row
    for (cwe_id, count_year), count in sorted(cwe_yearly_counts.items()):
        writer.writerow([cwe_id, count_year, count])

print(f"Yearly CWE counts written to {output_csv_yearly}")
104+
```
105+
106+
107+
108+
109+
# Preparing Yearly and Cumulative Counts
110+
111+
The yearly counts are loaded and preprocessed to ensure continuity in the timeline for each CWE ID. Missing years are filled with zero counts, and cumulative counts are calculated for each CWE over time.
112+
113+
The final dataset includes:
114+
1. CWE ID
115+
2. Year
116+
3. Yearly Count
117+
4. Cumulative Count
118+
119+
The processed data is saved to a CSV file for further analysis and visualization.
120+
121+
```python
122+
# Load the per-year CWE counts produced by the collection step.
df = pd.read_csv(output_csv_yearly)

# Build the complete (CWE, year) grid so every CWE has a row for every year
# in the observed range, even years in which it never appeared.
cwes = df["CWE_ID"].unique()
years = list(range(df["Year"].min(), df["Year"].max() + 1))
full_index = pd.MultiIndex.from_product([cwes, years], names=["CWE_ID", "Year"])
df_full = pd.DataFrame(index=full_index).reset_index()

# Left-join the observed counts onto the grid; absent years count as zero.
df = df_full.merge(df, on=["CWE_ID", "Year"], how="left").fillna({"Count": 0})
df = df.sort_values(by=["CWE_ID", "Year"])

# Running total per CWE gives its cumulative count over time.
df["Cumulative_Count"] = df.groupby("CWE_ID")["Count"].cumsum().astype(int)

# Save the final dataset for the analysis notebook.
df.to_csv(output_csv_cumulative, index=False)

print(f"Cumulative counts saved to {output_csv_cumulative}")
146+
147+
```
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
---
2+
jupyter:
3+
jupytext:
4+
text_representation:
5+
extension: .md
6+
format_name: markdown
7+
format_version: '1.3'
8+
jupytext_version: 1.16.6
9+
kernelspec:
10+
display_name: Python 3
11+
language: python
12+
name: python3
13+
---
14+
15+
# CVE Data Stories: CWE Trends - Data Analysis
16+
17+
```python
18+
import matplotlib.pyplot as plt
19+
import pandas as pd
20+
import seaborn as sns
21+
```
22+
23+
## Preparing the Top 10 CWE Dataset (1999–2024)
24+
25+
This dataset focuses on the **Top 10 CWEs** based on cumulative counts up to 2024, providing a clear view of the most prevalent vulnerabilities over time. The preparation process includes:
26+
27+
- **Data Filtering**:
28+
- Excluded `NVD-CWE-noinfo` and `NVD-CWE-Other` for cleaner analysis.
29+
- Focused on data between **1999 and 2024**, explicitly excluding 2025.
30+
- **Top 10 CWEs Selection**: Identified CWEs with the highest cumulative counts in 2024.
31+
- **Streamlined Dataset**: Retained only relevant entries for the Top 10 CWEs across the years.
32+
33+
This refined dataset is saved for further analysis, enabling impactful visualizations and insights into long-term CWE trends.
34+
35+
36+
```python
37+
# Load the cumulative counts produced by the data-processing notebook.
data = pd.read_csv("../../../data/cve_data_stories/cwe_trends/processed/cwe_yearly_cumulative.csv")

# Drop the NVD bookkeeping categories (`NVD-CWE-noinfo`, `NVD-CWE-Other`).
data = data[~data["CWE_ID"].str.startswith("NVD-CWE")]

# Keep only 1999 through 2024 inclusive.
data = data[data["Year"].between(1999, 2024)]

# Rank CWEs by their cumulative count at the end of 2024; keep the ten largest.
top_cwes_2024 = data[data["Year"] == 2024].sort_values("Cumulative_Count", ascending=False).head(10)
top_cwes_ids = top_cwes_2024["CWE_ID"].tolist()

# Restrict the timeline to those ten CWEs (2025 is excluded again defensively).
filtered_data = data[(data["CWE_ID"].isin(top_cwes_ids)) & (data["Year"] < 2025)].copy()

# Save the refined dataset for visualization.
filtered_data.to_csv(
    "../../../data/cve_data_stories/cwe_trends/processed/top_10_cwe_yearly_cumulative.csv",
    index=False,
)
56+
```
57+
58+
## Top 10 CWE Rank Trends (1999–2024)
59+
60+
This plot visualizes the **Top 10 CWEs** by rank over time, highlighting their evolution from 1999 to 2024. Each CWE’s line is color-coded, with key features to enhance clarity and impact:
61+
62+
- **End Markers & Labels**: Final ranks in 2024 are highlighted with markers and labeled directly for easy interpretation.
63+
- **Inverted Y-Axis**: Rank 1 is at the top, emphasizing higher frequency.
64+
- **Highlighted Years**: Dashed vertical lines mark notable years (2007, 2017, 2018, 2022, 2024).
65+
- **Readable Design**: A vibrant color palette, clear gridlines, and padding ensure visual appeal and clarity for sharing.
66+
67+
```python
68+
# Step 1: Dense-rank the CWEs within each year by cumulative count
# (rank 1 = highest cumulative count that year).
filtered_data["Rank"] = (
    filtered_data.groupby("Year")["Cumulative_Count"]
    .rank(method="dense", ascending=False)
    .astype(int)
)

# Map each CWE to its final (2024) rank so series can be ordered by outcome.
final_ranks = (
    filtered_data[filtered_data["Year"] == 2024]
    .sort_values("Rank")[["CWE_ID", "Rank"]]
    .set_index("CWE_ID")
)
filtered_data["Final_Rank"] = filtered_data["CWE_ID"].map(final_ranks["Rank"])

# Step 2: Sort by final rank, then year, for a deterministic plotting order.
filtered_data = filtered_data.sort_values(["Final_Rank", "Year"])

# Step 3: Plot the rank trends.
plt.figure(figsize=(16, 10))  # Larger figure size for better readability
sns.set_style("whitegrid")

# One distinct hue per CWE.
palette = sns.color_palette("husl", len(filtered_data["CWE_ID"].unique()))

# Plot each CWE line, with an end marker and a direct label on the right
# (the direct labels replace a conventional legend).
for i, (cwe_id, cwe_data) in enumerate(filtered_data.groupby("CWE_ID")):
    plt.plot(
        cwe_data["Year"],
        cwe_data["Rank"],
        color=palette[i],
        label=cwe_id,
        linewidth=5.5,
        alpha=0.9,
    )
    # Marker on the final (2024) data point.
    plt.scatter(
        cwe_data["Year"].iloc[-1],  # Last year
        cwe_data["Rank"].iloc[-1],  # Last rank
        color=palette[i],
        edgecolor="black",
        s=100,  # Marker size
        zorder=5,
    )
    # Right-side label, slightly offset from the end marker.
    plt.text(
        cwe_data["Year"].iloc[-1] + 0.25,  # Offset for label spacing
        cwe_data["Rank"].iloc[-1],
        cwe_id,
        fontsize=12,
        weight="bold",
        color=palette[i],
        verticalalignment="center",
    )

# Invert y-axis so rank 1 sits at the top.
plt.gca().invert_yaxis()

# Main title.
plt.title("Top 10 CWE Rank Trends Over Time\n(1999–2024)", fontsize=26, weight="bold", pad=20)

# Axis ticks: every year on x, ranks 1..10 on y.
plt.xticks(ticks=range(1999, 2025), fontsize=12)
plt.yticks(range(1, 11), fontsize=12)  # showing ranks 1 to 10

# Right-side padding so end markers and labels are not clipped.
plt.xlim(1999, 2025)

# Lines are labeled directly, so suppress the legend.
plt.legend([], [], frameon=False)

# Gridlines
plt.grid(visible=True, linestyle="--", linewidth=0.5, alpha=0.7)

# Dashed vertical guides at notable years.
highlight_years = [2007, 2017, 2018, 2022, 2024]
for year in highlight_years:
    plt.axvline(x=year, color="gray", linestyle="--", linewidth=1, alpha=0.4)

plt.tight_layout()
# Fix: this chart shows the top 10 CWEs (title, y-ticks, and dataset all say
# top 10), but the file was previously saved as "top_25_cwe_rank_trends.png".
# The file name now matches the content.
plt.savefig("../../../data/cve_data_stories/cwe_trends/processed/top_10_cwe_rank_trends.png", dpi=300,
            bbox_inches="tight")
plt.show()
151+
```

notebooks/cve_data_stories/01_data_collection.ipynb

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
{
1010
"metadata": {
1111
"ExecuteTime": {
12-
"end_time": "2025-01-04T17:52:27.003359Z",
13-
"start_time": "2025-01-04T17:52:26.999031Z"
12+
"end_time": "2025-01-11T10:58:00.353617Z",
13+
"start_time": "2025-01-11T10:58:00.217081Z"
1414
}
1515
},
1616
"cell_type": "code",
@@ -22,7 +22,7 @@
2222
],
2323
"id": "f0ea410ba01c8838",
2424
"outputs": [],
25-
"execution_count": 5
25+
"execution_count": 1
2626
},
2727
{
2828
"metadata": {},
@@ -39,8 +39,8 @@
3939
{
4040
"metadata": {
4141
"ExecuteTime": {
42-
"end_time": "2025-01-04T17:52:27.026069Z",
43-
"start_time": "2025-01-04T17:52:27.022020Z"
42+
"end_time": "2025-01-11T10:58:00.363251Z",
43+
"start_time": "2025-01-11T10:58:00.359895Z"
4444
}
4545
},
4646
"cell_type": "code",
@@ -51,7 +51,7 @@
5151
],
5252
"id": "99e5bc4542e6d1d7",
5353
"outputs": [],
54-
"execution_count": 6
54+
"execution_count": 2
5555
},
5656
{
5757
"cell_type": "markdown",
@@ -73,8 +73,8 @@
7373
{
7474
"metadata": {
7575
"ExecuteTime": {
76-
"end_time": "2025-01-04T17:52:37.079374Z",
77-
"start_time": "2025-01-04T17:52:27.049230Z"
76+
"end_time": "2025-01-11T10:58:08.702411Z",
77+
"start_time": "2025-01-11T10:58:00.637617Z"
7878
}
7979
},
8080
"cell_type": "code",
@@ -198,7 +198,7 @@
198198
]
199199
}
200200
],
201-
"execution_count": 7
201+
"execution_count": 3
202202
}
203203
],
204204
"metadata": {

0 commit comments

Comments
 (0)