Skip to content

feat(data): Added automated CPU batch updater #841

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
May 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@
"\n",
"url = \"https://www.intel.com/content/www/us/en/ark/products/series/236644/5th-gen-intel-xeon-scalable-processors.html\"\n",
"response = requests.get(url)\n",
"soup = BeautifulSoup(response.text, 'html.parser')"
"soup = BeautifulSoup(response.text, \"html.parser\")"
]
},
{
Expand Down Expand Up @@ -454,25 +454,26 @@
],
"source": [
"# Find the table\n",
"table = soup.find('table', id='product-table')\n",
"table = soup.find(\"table\", id=\"product-table\")\n",
"\n",
"# Extract headers\n",
"headers = []\n",
"for th in table.find_all('th'):\n",
" header_text = th.find('div', class_='header-text-space').text.strip()\n",
"for th in table.find_all(\"th\"):\n",
" header_text = th.find(\"div\", class_=\"header-text-space\").text.strip()\n",
" headers.append(header_text)\n",
"\n",
"# Extract rows\n",
"data = []\n",
"for row in table.find('tbody').find_all('tr'):\n",
"for row in table.find(\"tbody\").find_all(\"tr\"):\n",
" row_data = {}\n",
" cells = row.find_all('td')\n",
" cells = row.find_all(\"td\")\n",
" for i, cell in enumerate(cells):\n",
" row_data[headers[i]] = cell.text.strip()\n",
" data.append(row_data)\n",
"\n",
"# Convert to pandas DataFrame for easy viewing/export\n",
"import pandas as pd\n",
"\n",
"df = pd.DataFrame(data)\n",
"print(df)"
]
Expand Down Expand Up @@ -526,7 +527,7 @@
}
],
"source": [
"df['TDP'].replace(\"W\", \"\")"
"df[\"TDP\"].replace(\"W\", \"\")"
]
},
{
Expand Down Expand Up @@ -554,24 +555,28 @@
"\n",
"# Path to your manually exported AMD CPU dataset.\n",
"# (Adjust the file path as needed.)\n",
"amd_csv_path = './AMD_CPU_desktop_laptop.csv'\n",
"amd_csv_path = \"./AMD_CPU_desktop_laptop.csv\"\n",
"\n",
"try:\n",
" amd_df = pd.read_csv(amd_csv_path)\n",
" amd_df = amd_df[amd_df['Launch Date'].str.contains(\"2024|2025\", na=False)]\n",
" amd_df = amd_df[amd_df['Form Factor'].str.contains(\"Desktops\", na=False)]\n",
" \n",
" amd_df = amd_df[amd_df[\"Launch Date\"].str.contains(\"2024|2025\", na=False)]\n",
" amd_df = amd_df[amd_df[\"Form Factor\"].str.contains(\"Desktops\", na=False)]\n",
"\n",
" # Convert columns to numeric, forcing errors to NaN\n",
" # amd_df['Default TDP'] = amd_df['Default TDP'].str.replace('W', '').astype(float)\n",
" amd_df['TDP'] = pd.to_numeric(amd_df['Default TDP'].str.replace('W', ''), errors='coerce')\n",
" amd_df['# of Threads'] = pd.to_numeric(amd_df['# of Threads'], errors='coerce')\n",
" \n",
" amd_df[\"TDP\"] = pd.to_numeric(\n",
" amd_df[\"Default TDP\"].str.replace(\"W\", \"\"), errors=\"coerce\"\n",
" )\n",
" amd_df[\"# of Threads\"] = pd.to_numeric(amd_df[\"# of Threads\"], errors=\"coerce\")\n",
"\n",
" # It is assumed the CSV contains columns named 'TDP' (in Watts) and 'Total Cores'\n",
" # Adjust the column names if they differ.\n",
" amd_df['TDP_per_core'] = amd_df['TDP'] / amd_df['# of Threads']\n",
" \n",
" average_tdp_per_core = amd_df['TDP_per_core'].mean()\n",
" print(\"Average TDP per core from AMD dataset: {:.2f} W\".format(average_tdp_per_core))\n",
" amd_df[\"TDP_per_core\"] = amd_df[\"TDP\"] / amd_df[\"# of Threads\"]\n",
"\n",
" average_tdp_per_core = amd_df[\"TDP_per_core\"].mean()\n",
" print(\n",
" \"Average TDP per core from AMD dataset: {:.2f} W\".format(average_tdp_per_core)\n",
" )\n",
"except Exception as e:\n",
" print(\"Error loading or processing AMD dataset:\", e)"
]
Expand All @@ -595,23 +600,27 @@
"\n",
"# Path to your manually exported AMD CPU dataset.\n",
"# (Adjust the file path as needed.)\n",
"amd_csv_path = './AMD_Server_Processor_Specifications.csv'\n",
"amd_csv_path = \"./AMD_Server_Processor_Specifications.csv\"\n",
"\n",
"try:\n",
" amd_df = pd.read_csv(amd_csv_path)\n",
" amd_df = amd_df[amd_df['Launch Date'].str.contains(\"2024|2025\", na=False)]\n",
" \n",
" amd_df = amd_df[amd_df[\"Launch Date\"].str.contains(\"2024|2025\", na=False)]\n",
"\n",
" # Convert columns to numeric, forcing errors to NaN\n",
" # amd_df['Default TDP'] = amd_df['Default TDP'].str.replace('W', '').astype(float)\n",
" amd_df['TDP'] = pd.to_numeric(amd_df['Default TDP'].str.replace('W', ''), errors='coerce')\n",
" amd_df['# of Threads'] = pd.to_numeric(amd_df['# of Threads'], errors='coerce')\n",
" \n",
" amd_df[\"TDP\"] = pd.to_numeric(\n",
" amd_df[\"Default TDP\"].str.replace(\"W\", \"\"), errors=\"coerce\"\n",
" )\n",
" amd_df[\"# of Threads\"] = pd.to_numeric(amd_df[\"# of Threads\"], errors=\"coerce\")\n",
"\n",
" # It is assumed the CSV contains columns named 'TDP' (in Watts) and 'Total Cores'\n",
" # Adjust the column names if they differ.\n",
" amd_df['TDP_per_core'] = amd_df['TDP'] / amd_df['# of Threads']\n",
" \n",
" average_tdp_per_core = amd_df['TDP_per_core'].mean()\n",
" print(\"Average TDP per core from AMD dataset: {:.2f} W\".format(average_tdp_per_core))\n",
" amd_df[\"TDP_per_core\"] = amd_df[\"TDP\"] / amd_df[\"# of Threads\"]\n",
"\n",
" average_tdp_per_core = amd_df[\"TDP_per_core\"].mean()\n",
" print(\n",
" \"Average TDP per core from AMD dataset: {:.2f} W\".format(average_tdp_per_core)\n",
" )\n",
"except Exception as e:\n",
" print(\"Error loading or processing AMD dataset:\", e)"
]
Expand Down Expand Up @@ -1055,7 +1064,7 @@
}
],
"source": [
"amd_df[['Name', 'TDP', '# of Threads', 'TDP_per_core', 'Launch Date']]"
"amd_df[[\"Name\", \"TDP\", \"# of Threads\", \"TDP_per_core\", \"Launch Date\"]]"
]
},
{
Expand Down Expand Up @@ -1179,7 +1188,7 @@
}
],
"source": [
"df = pd.read_csv('cpu_power.csv')\n",
"df = pd.read_csv(\"cpu_power.csv\")\n",
"df"
]
},
Expand Down Expand Up @@ -1246,12 +1255,14 @@
}
],
"source": [
"amd_csv_path = './AMD_Server_Processor_Specifications.csv'\n",
"amd_csv_path = \"./AMD_Server_Processor_Specifications.csv\"\n",
"amd_df = pd.read_csv(amd_csv_path)\n",
"amd_df['TDP'] = pd.to_numeric(amd_df['Default TDP'].str.replace('W', ''), errors='coerce')\n",
"amd_df['Name'] = amd_df['Name'].str.replace('™', '')\n",
"amd_server = amd_df[['Name', 'TDP' ]]\n",
"amd_server = amd_server.dropna(subset=['TDP'])\n",
"amd_df[\"TDP\"] = pd.to_numeric(\n",
" amd_df[\"Default TDP\"].str.replace(\"W\", \"\"), errors=\"coerce\"\n",
")\n",
"amd_df[\"Name\"] = amd_df[\"Name\"].str.replace(\"™\", \"\")\n",
"amd_server = amd_df[[\"Name\", \"TDP\"]]\n",
"amd_server = amd_server.dropna(subset=[\"TDP\"])\n",
"amd_server.head(3)"
]
},
Expand Down Expand Up @@ -1406,11 +1417,13 @@
],
"source": [
"# Merge df with amd_server, avoiding duplicates\n",
"amd_server['TDP'] = amd_server['TDP'].astype(int).astype(str)\n",
"merged_df = df.merge(amd_server, on='Name', how='outer', suffixes=('', '_AMD'), indicator=True)\n",
"amd_server[\"TDP\"] = amd_server[\"TDP\"].astype(int).astype(str)\n",
"merged_df = df.merge(\n",
" amd_server, on=\"Name\", how=\"outer\", suffixes=(\"\", \"_AMD\"), indicator=True\n",
")\n",
"# Filter for new entries that are only in amd_server\n",
"new_cpus = merged_df[merged_df['_merge'] == 'right_only']\n",
"new_cpus['TDP'] = new_cpus['TDP_AMD']\n",
"new_cpus = merged_df[merged_df[\"_merge\"] == \"right_only\"]\n",
"new_cpus[\"TDP\"] = new_cpus[\"TDP_AMD\"]\n",
"new_cpus"
]
},
Expand Down Expand Up @@ -1528,11 +1541,9 @@
}
],
"source": [
"\n",
"\n",
"# merged_df.query('Name.str.contains(\"EPYC\")')\n",
"new_cpus_to_add = new_cpus.drop(columns=['_merge']).loc[:, df.columns]\n",
"new_cpus_to_add\n"
"new_cpus_to_add = new_cpus.drop(columns=[\"_merge\"]).loc[:, df.columns]\n",
"new_cpus_to_add"
]
},
{
Expand Down Expand Up @@ -1588,7 +1599,7 @@
"source": [
"# Option 2: Append the new CPUs to the original df\n",
"df = pd.concat([df, new_cpus_to_add], ignore_index=True)\n",
"df.sort_values('Name', ascending=True, inplace=True)\n",
"df.sort_values(\"Name\", ascending=True, inplace=True)\n",
"df.query('Name.str.contains(\"AMD EPYC 4124P\")')"
]
},
Expand All @@ -1598,7 +1609,36 @@
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('cpu_power.csv', index=False)"
"df.to_csv(\"cpu_power.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Remove with..."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"# Load the existing dataset\n",
"df = pd.read_csv(\"../cpu_power.csv\")\n",
"# Replace with re.sub(r\" with.*\", \"\", name)\n",
"def clean_cpu_name(name):\n",
" import re\n",
" # Remove \"with\" and everything after it\n",
" name = re.sub(r\" with.*\", \"\", name)\n",
" # Remove \"™\" symbol\n",
" name = name.replace(\"™\", \"\")\n",
" return name.strip()\n",
"df[\"Name\"] = df[\"Name\"].apply(clean_cpu_name)\n",
"# Save the cleaned dataset\n",
"df.to_csv(\"../cpu_power.csv\", index=False)"
]
},
{
Expand All @@ -1609,7 +1649,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "codecarbon",
"display_name": "3.10.5",
"language": "python",
"name": "python3"
},
Expand All @@ -1623,7 +1663,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
"version": "3.10.5"
}
},
"nbformat": 4,
Expand Down
15 changes: 15 additions & 0 deletions codecarbon/data/hardware/cpu_dataset_builder/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# How to update the CPU database

To update the CPU database, you have to run:

```bash
cd codecarbon/data/hardware/cpu_dataset_builder
hatch run pip install playwright beautifulsoup4
hatch run python intel_cpu_scrapper.py
hatch run python amd_cpu_scrapper.py
hatch run python merge_scrapped_cpu_power.py
```

Then commit the changes to the CSV files.

CodeCarbon only uses the `cpu_power.csv` file, but we keep the other files for reference and to allow someone else to use them if needed.
Loading
Loading