Skip to content

feat(data): Added automated CPU batch updater #841

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
May 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@
"\n",
"url = \"https://www.intel.com/content/www/us/en/ark/products/series/236644/5th-gen-intel-xeon-scalable-processors.html\"\n",
"response = requests.get(url)\n",
"soup = BeautifulSoup(response.text, 'html.parser')"
"soup = BeautifulSoup(response.text, \"html.parser\")"
]
},
{
Expand Down Expand Up @@ -454,25 +454,26 @@
],
"source": [
"# Find the table\n",
"table = soup.find('table', id='product-table')\n",
"table = soup.find(\"table\", id=\"product-table\")\n",
"\n",
"# Extract headers\n",
"headers = []\n",
"for th in table.find_all('th'):\n",
" header_text = th.find('div', class_='header-text-space').text.strip()\n",
"for th in table.find_all(\"th\"):\n",
" header_text = th.find(\"div\", class_=\"header-text-space\").text.strip()\n",
" headers.append(header_text)\n",
"\n",
"# Extract rows\n",
"data = []\n",
"for row in table.find('tbody').find_all('tr'):\n",
"for row in table.find(\"tbody\").find_all(\"tr\"):\n",
" row_data = {}\n",
" cells = row.find_all('td')\n",
" cells = row.find_all(\"td\")\n",
" for i, cell in enumerate(cells):\n",
" row_data[headers[i]] = cell.text.strip()\n",
" data.append(row_data)\n",
"\n",
"# Convert to pandas DataFrame for easy viewing/export\n",
"import pandas as pd\n",
"\n",
"df = pd.DataFrame(data)\n",
"print(df)"
]
Expand Down Expand Up @@ -526,7 +527,7 @@
}
],
"source": [
"df['TDP'].replace(\"W\", \"\")"
"df[\"TDP\"].replace(\"W\", \"\")"
]
},
{
Expand Down Expand Up @@ -554,24 +555,28 @@
"\n",
"# Path to your manually exported AMD CPU dataset.\n",
"# (Adjust the file path as needed.)\n",
"amd_csv_path = './AMD_CPU_desktop_laptop.csv'\n",
"amd_csv_path = \"./AMD_CPU_desktop_laptop.csv\"\n",
"\n",
"try:\n",
" amd_df = pd.read_csv(amd_csv_path)\n",
" amd_df = amd_df[amd_df['Launch Date'].str.contains(\"2024|2025\", na=False)]\n",
" amd_df = amd_df[amd_df['Form Factor'].str.contains(\"Desktops\", na=False)]\n",
" \n",
" amd_df = amd_df[amd_df[\"Launch Date\"].str.contains(\"2024|2025\", na=False)]\n",
" amd_df = amd_df[amd_df[\"Form Factor\"].str.contains(\"Desktops\", na=False)]\n",
"\n",
" # Convert columns to numeric, forcing errors to NaN\n",
" # amd_df['Default TDP'] = amd_df['Default TDP'].str.replace('W', '').astype(float)\n",
" amd_df['TDP'] = pd.to_numeric(amd_df['Default TDP'].str.replace('W', ''), errors='coerce')\n",
" amd_df['# of Threads'] = pd.to_numeric(amd_df['# of Threads'], errors='coerce')\n",
" \n",
" amd_df[\"TDP\"] = pd.to_numeric(\n",
" amd_df[\"Default TDP\"].str.replace(\"W\", \"\"), errors=\"coerce\"\n",
" )\n",
" amd_df[\"# of Threads\"] = pd.to_numeric(amd_df[\"# of Threads\"], errors=\"coerce\")\n",
"\n",
" # It is assumed the CSV contains columns named 'TDP' (in Watts) and 'Total Cores'\n",
" # Adjust the column names if they differ.\n",
" amd_df['TDP_per_core'] = amd_df['TDP'] / amd_df['# of Threads']\n",
" \n",
" average_tdp_per_core = amd_df['TDP_per_core'].mean()\n",
" print(\"Average TDP per core from AMD dataset: {:.2f} W\".format(average_tdp_per_core))\n",
" amd_df[\"TDP_per_core\"] = amd_df[\"TDP\"] / amd_df[\"# of Threads\"]\n",
"\n",
" average_tdp_per_core = amd_df[\"TDP_per_core\"].mean()\n",
" print(\n",
" \"Average TDP per core from AMD dataset: {:.2f} W\".format(average_tdp_per_core)\n",
" )\n",
"except Exception as e:\n",
" print(\"Error loading or processing AMD dataset:\", e)"
]
Expand All @@ -595,23 +600,27 @@
"\n",
"# Path to your manually exported AMD CPU dataset.\n",
"# (Adjust the file path as needed.)\n",
"amd_csv_path = './AMD_Server_Processor_Specifications.csv'\n",
"amd_csv_path = \"./AMD_Server_Processor_Specifications.csv\"\n",
"\n",
"try:\n",
" amd_df = pd.read_csv(amd_csv_path)\n",
" amd_df = amd_df[amd_df['Launch Date'].str.contains(\"2024|2025\", na=False)]\n",
" \n",
" amd_df = amd_df[amd_df[\"Launch Date\"].str.contains(\"2024|2025\", na=False)]\n",
"\n",
" # Convert columns to numeric, forcing errors to NaN\n",
" # amd_df['Default TDP'] = amd_df['Default TDP'].str.replace('W', '').astype(float)\n",
" amd_df['TDP'] = pd.to_numeric(amd_df['Default TDP'].str.replace('W', ''), errors='coerce')\n",
" amd_df['# of Threads'] = pd.to_numeric(amd_df['# of Threads'], errors='coerce')\n",
" \n",
" amd_df[\"TDP\"] = pd.to_numeric(\n",
" amd_df[\"Default TDP\"].str.replace(\"W\", \"\"), errors=\"coerce\"\n",
" )\n",
" amd_df[\"# of Threads\"] = pd.to_numeric(amd_df[\"# of Threads\"], errors=\"coerce\")\n",
"\n",
" # It is assumed the CSV contains columns named 'TDP' (in Watts) and 'Total Cores'\n",
" # Adjust the column names if they differ.\n",
" amd_df['TDP_per_core'] = amd_df['TDP'] / amd_df['# of Threads']\n",
" \n",
" average_tdp_per_core = amd_df['TDP_per_core'].mean()\n",
" print(\"Average TDP per core from AMD dataset: {:.2f} W\".format(average_tdp_per_core))\n",
" amd_df[\"TDP_per_core\"] = amd_df[\"TDP\"] / amd_df[\"# of Threads\"]\n",
"\n",
" average_tdp_per_core = amd_df[\"TDP_per_core\"].mean()\n",
" print(\n",
" \"Average TDP per core from AMD dataset: {:.2f} W\".format(average_tdp_per_core)\n",
" )\n",
"except Exception as e:\n",
" print(\"Error loading or processing AMD dataset:\", e)"
]
Expand Down Expand Up @@ -1055,7 +1064,7 @@
}
],
"source": [
"amd_df[['Name', 'TDP', '# of Threads', 'TDP_per_core', 'Launch Date']]"
"amd_df[[\"Name\", \"TDP\", \"# of Threads\", \"TDP_per_core\", \"Launch Date\"]]"
]
},
{
Expand Down Expand Up @@ -1179,7 +1188,7 @@
}
],
"source": [
"df = pd.read_csv('cpu_power.csv')\n",
"df = pd.read_csv(\"cpu_power.csv\")\n",
"df"
]
},
Expand Down Expand Up @@ -1246,12 +1255,14 @@
}
],
"source": [
"amd_csv_path = './AMD_Server_Processor_Specifications.csv'\n",
"amd_csv_path = \"./AMD_Server_Processor_Specifications.csv\"\n",
"amd_df = pd.read_csv(amd_csv_path)\n",
"amd_df['TDP'] = pd.to_numeric(amd_df['Default TDP'].str.replace('W', ''), errors='coerce')\n",
"amd_df['Name'] = amd_df['Name'].str.replace('™', '')\n",
"amd_server = amd_df[['Name', 'TDP' ]]\n",
"amd_server = amd_server.dropna(subset=['TDP'])\n",
"amd_df[\"TDP\"] = pd.to_numeric(\n",
" amd_df[\"Default TDP\"].str.replace(\"W\", \"\"), errors=\"coerce\"\n",
")\n",
"amd_df[\"Name\"] = amd_df[\"Name\"].str.replace(\"™\", \"\")\n",
"amd_server = amd_df[[\"Name\", \"TDP\"]]\n",
"amd_server = amd_server.dropna(subset=[\"TDP\"])\n",
"amd_server.head(3)"
]
},
Expand Down Expand Up @@ -1406,11 +1417,13 @@
],
"source": [
"# Merge df with amd_server, avoiding duplicates\n",
"amd_server['TDP'] = amd_server['TDP'].astype(int).astype(str)\n",
"merged_df = df.merge(amd_server, on='Name', how='outer', suffixes=('', '_AMD'), indicator=True)\n",
"amd_server[\"TDP\"] = amd_server[\"TDP\"].astype(int).astype(str)\n",
"merged_df = df.merge(\n",
" amd_server, on=\"Name\", how=\"outer\", suffixes=(\"\", \"_AMD\"), indicator=True\n",
")\n",
"# Filter for new entries that are only in amd_server\n",
"new_cpus = merged_df[merged_df['_merge'] == 'right_only']\n",
"new_cpus['TDP'] = new_cpus['TDP_AMD']\n",
"new_cpus = merged_df[merged_df[\"_merge\"] == \"right_only\"]\n",
"new_cpus[\"TDP\"] = new_cpus[\"TDP_AMD\"]\n",
"new_cpus"
]
},
Expand Down Expand Up @@ -1528,11 +1541,9 @@
}
],
"source": [
"\n",
"\n",
"# merged_df.query('Name.str.contains(\"EPYC\")')\n",
"new_cpus_to_add = new_cpus.drop(columns=['_merge']).loc[:, df.columns]\n",
"new_cpus_to_add\n"
"new_cpus_to_add = new_cpus.drop(columns=[\"_merge\"]).loc[:, df.columns]\n",
"new_cpus_to_add"
]
},
{
Expand Down Expand Up @@ -1588,7 +1599,7 @@
"source": [
"# Option 2: Append the new CPUs to the original df\n",
"df = pd.concat([df, new_cpus_to_add], ignore_index=True)\n",
"df.sort_values('Name', ascending=True, inplace=True)\n",
"df.sort_values(\"Name\", ascending=True, inplace=True)\n",
"df.query('Name.str.contains(\"AMD EPYC 4124P\")')"
]
},
Expand All @@ -1598,7 +1609,36 @@
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('cpu_power.csv', index=False)"
"df.to_csv(\"cpu_power.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Remove with..."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"# Load the existing dataset\n",
"df = pd.read_csv(\"../cpu_power.csv\")\n",
"# Replace with re.sub(r\" with.*\", \"\", name)\n",
"def clean_cpu_name(name):\n",
" import re\n",
" # Remove \"with\" and everything after it\n",
" name = re.sub(r\" with.*\", \"\", name)\n",
" # Remove \"™\" symbol\n",
" name = name.replace(\"™\", \"\")\n",
" return name.strip()\n",
"df[\"Name\"] = df[\"Name\"].apply(clean_cpu_name)\n",
"# Save the cleaned dataset\n",
"df.to_csv(\"../cpu_power.csv\", index=False)"
]
},
{
Expand All @@ -1609,7 +1649,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "codecarbon",
"display_name": "3.10.5",
"language": "python",
"name": "python3"
},
Expand All @@ -1623,7 +1663,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
"version": "3.10.5"
}
},
"nbformat": 4,
Expand Down
15 changes: 15 additions & 0 deletions codecarbon/data/hardware/cpu_dataset_builder/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# How to update the CPU database

To update the CPU database, you have to run:

```bash
cd codecarbon/data/hardware/cpu_dataset_builder
hatch run pip install playwright beautifulsoup4
hatch run python intel_cpu_scrapper.py
hatch run python amd_cpu_scrapper.py
hatch run python merge_scrapped_cpu_power.py
```

Then commit the changes to the CSV files.

CodeCarbon only uses the `cpu_power.csv` file, but we keep the other files for reference and to allow someone else to use them if needed.
Loading
Loading