|
| 1 | +--- |
| 2 | +jupyter: |
| 3 | + jupytext: |
| 4 | + text_representation: |
| 5 | + extension: .md |
| 6 | + format_name: markdown |
| 7 | + format_version: '1.3' |
| 8 | + jupytext_version: 1.16.6 |
| 9 | + kernelspec: |
| 10 | + display_name: Python 3 |
| 11 | + language: python |
| 12 | + name: python3 |
| 13 | +--- |
| 14 | + |
| 15 | +# CVE Data Stories: CWE Trends - Data Analysis |
| 16 | + |
| 17 | +```python |
| 18 | +import matplotlib.pyplot as plt |
| 19 | +import pandas as pd |
| 20 | +import seaborn as sns |
| 21 | +``` |
| 22 | + |
| 23 | +## Preparing the Top 10 CWE Dataset (1999–2024) |
| 24 | + |
| 25 | +This dataset focuses on the **Top 10 CWEs** based on cumulative counts up to 2024, providing a clear view of the most prevalent vulnerabilities over time. The preparation process includes: |
| 26 | + |
| 27 | +- **Data Filtering**: |
| 28 | + - Excluded `NVD-CWE-noinfo` and `NVD-CWE-Other` for cleaner analysis. |
| 29 | + - Focused on data between **1999 and 2024**, explicitly excluding 2025. |
| 30 | +- **Top 10 CWEs Selection**: Identified the ten CWEs with the highest cumulative counts as of 2024. |
| 31 | +- **Streamlined Dataset**: Retained only relevant entries for the Top 10 CWEs across the years. |
| 32 | + |
| 33 | +This refined dataset is saved for further analysis, enabling impactful visualizations and insights into long-term CWE trends. |
| 34 | + |
| 35 | + |
| 36 | +```python |
# Build the Top 10 CWE dataset (1999-2024) from the yearly cumulative counts
# and save it so the visualizations below (and other notebooks) can reuse it.
data = pd.read_csv("../../../data/cve_data_stories/cwe_trends/processed/cwe_yearly_cumulative.csv")

# Drop the NVD placeholder categories (`NVD-CWE-noinfo`, `NVD-CWE-Other`).
# `na=False` keeps the mask strictly boolean: without it, a missing CWE_ID can
# put NaN into the mask, which breaks the `~` negation / boolean indexing.
is_nvd_placeholder = data["CWE_ID"].str.startswith("NVD-CWE", na=False)
data = data[~is_nvd_placeholder]

# Keep 1999 through 2024 inclusive (2025 is excluded)
data = data[(data["Year"] >= 1999) & (data["Year"] < 2025)]

# Pick the 10 CWEs with the highest cumulative count in 2024
top_cwes_2024 = data[data["Year"] == 2024].sort_values("Cumulative_Count", ascending=False).head(10)
top_cwes_ids = top_cwes_2024["CWE_ID"].tolist()

# Keep every year of history for those CWEs. `data` is already limited to
# 1999-2024, so no second year filter is needed. `.copy()` yields an
# independent frame that later cells can add columns to.
filtered_data = data[data["CWE_ID"].isin(top_cwes_ids)].copy()

# Save the final dataset
filtered_data.to_csv(
    "../../../data/cve_data_stories/cwe_trends/processed/top_10_cwe_yearly_cumulative.csv",
    index=False,
)
| 56 | +``` |
| 57 | + |
| 58 | +## Top 10 CWE Rank Trends (1999–2024) |
| 59 | + |
| 60 | +This plot visualizes the **Top 10 CWEs** by rank over time, highlighting their evolution from 1999 to 2024. Each CWE’s line is color-coded, with key features to enhance clarity and impact: |
| 61 | + |
| 62 | +- **End Markers & Labels**: Final ranks in 2024 are highlighted with markers and labeled directly for easy interpretation. |
| 63 | +- **Inverted Y-Axis**: Rank 1 is at the top, so the most prevalent CWEs appear highest on the chart. |
| 64 | +- **Highlighted Years**: Dashed vertical lines mark notable years (2007, 2017, 2018, 2022, 2024). |
| 65 | +- **Readable Design**: A vibrant color palette, clear gridlines, and padding ensure visual appeal and clarity for sharing. |
| 66 | + |
| 67 | +```python |
# Draw a rank chart of the Top 10 CWEs: one line per CWE showing its rank
# (by cumulative count among the selected CWEs) for every year, with the
# final point marked and labelled on the right-hand side.

# Step 1: Rank the CWEs within each year (1 = highest cumulative count).
# "dense" ranking gives tied CWEs the same rank and leaves no gaps.
filtered_data["Rank"] = (
    filtered_data.groupby("Year")["Cumulative_Count"]
    .rank(method="dense", ascending=False)
    .astype(int)
)

# Look up each CWE's rank in the final year (2024) and attach it to every row
final_ranks = (
    filtered_data[filtered_data["Year"] == 2024]
    .sort_values("Rank")[["CWE_ID", "Rank"]]
    .set_index("CWE_ID")
)
filtered_data["Final_Rank"] = filtered_data["CWE_ID"].map(final_ranks["Rank"])

# Step 2: Order rows by final rank, then chronologically within each CWE
filtered_data = filtered_data.sort_values(["Final_Rank", "Year"])

# Step 3: Plot the rank trends
plt.figure(figsize=(16, 10))  # Larger figure size for better readability
sns.set_style("whitegrid")

# One colour per CWE from a vibrant palette
palette = sns.color_palette("husl", filtered_data["CWE_ID"].nunique())

# `sort=False` makes groupby yield the CWEs in the row order established in
# Step 2 (final rank). The default `sort=True` would re-order the groups
# alphabetically by CWE_ID string and silently discard that ordering, so the
# palette would not follow the final ranking.
for i, (cwe_id, cwe_data) in enumerate(filtered_data.groupby("CWE_ID", sort=False)):
    colour = palette[i]
    # Rows are in chronological order, so the last row is the latest year
    last_year = cwe_data["Year"].iloc[-1]
    last_rank = cwe_data["Rank"].iloc[-1]

    # Rank trajectory of this CWE
    plt.plot(
        cwe_data["Year"],
        cwe_data["Rank"],
        color=colour,
        label=cwe_id,
        linewidth=5.5,
        alpha=0.9,
    )
    # Marker at the end of the line
    plt.scatter(
        last_year,
        last_rank,
        color=colour,
        edgecolor="black",
        s=100,  # Marker size
        zorder=5,  # Draw above the lines
    )
    # Direct label to the right of the end marker
    plt.text(
        last_year + 0.25,  # Offset for label spacing
        last_rank,
        cwe_id,
        fontsize=12,
        weight="bold",
        color=colour,
        verticalalignment="center",
    )

# Invert y-axis to show rank 1 at top
plt.gca().invert_yaxis()

# Title with the covered period on a second line
plt.title("Top 10 CWE Rank Trends Over Time\n(1999–2024)", fontsize=26, weight="bold", pad=20)

# Axis ticks: one per year, and ranks 1 to 10
plt.xticks(ticks=range(1999, 2025), fontsize=12)
plt.yticks(range(1, 11), fontsize=12)

# Extend the x-axis one year past the data to leave room for dots and labels
plt.xlim(1999, 2025)

# Replace the legend with an empty one since lines are labelled directly
plt.legend([], [], frameon=False)

# Gridlines
plt.grid(visible=True, linestyle="--", linewidth=0.5, alpha=0.7)

# Highlight specific years with vertical lines
highlight_years = [2007, 2017, 2018, 2022, 2024]
for year in highlight_years:
    plt.axvline(x=year, color="gray", linestyle="--", linewidth=1, alpha=0.4)

plt.tight_layout()
# NOTE(review): the file name says "top_25" although this chart shows the
# Top 10 CWEs. The path is kept unchanged because other files may reference
# it; rename it together with those references if this is a leftover.
plt.savefig(
    "../../../data/cve_data_stories/cwe_trends/processed/top_25_cwe_rank_trends.png",
    dpi=300,
    bbox_inches="tight",
)
plt.show()
| 151 | +``` |
0 commit comments