From 0a143bfdd55ea3d1f6f556f71f39375ea3e5027b Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Jun 2025 14:21:14 +0200 Subject: [PATCH 1/4] style: change order, first json, then csv Making this change because in Python it doesn't matter and in JavaScript it's easier to start with JSON, which is built-in, and only then move to CSV, which requires an additional library. --- .../scraping_basics_python/08_saving_data.md | 144 +++++++++--------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md index 8d8690c2c..08e47fa71 100644 --- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md @@ -78,65 +78,11 @@ If you find the complex data structures printed by `print()` difficult to read, ::: -## Saving data as CSV - -The CSV format is popular among data analysts because a wide range of tools can import it, including spreadsheets apps like LibreOffice Calc, Microsoft Excel, Apple Numbers, and Google Sheets. - -In Python, it's convenient to read and write CSV files, thanks to the [`csv`](https://docs.python.org/3/library/csv.html) standard library module. First let's try something small in the Python's interactive REPL to familiarize ourselves with the basic usage: - -```py ->>> import csv ->>> with open("data.csv", "w") as file: -... writer = csv.DictWriter(file, fieldnames=["name", "age", "hobbies"]) -... writer.writeheader() -... writer.writerow({"name": "Alice", "age": 24, "hobbies": "kickbox, Python"}) -... writer.writerow({"name": "Bob", "age": 42, "hobbies": "reading, TypeScript"}) -... -``` - -We first opened a new file for writing and created a `DictWriter()` instance with the expected field names. We instructed it to write the header row first and then added two more rows containing actual data. The code produced a `data.csv` file in the same directory where we're running the REPL. It has the following contents: - -```csv title=data.csv -name,age,hobbies -Alice,24,"kickbox, Python" -Bob,42,"reading, TypeScript" -``` - -In the CSV format, if values contain commas, we should enclose them in quotes. You can see that the writer automatically handled this. - -When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it as well. If you're using a different operating system, try opening the file with any spreadsheet program you have. - -![CSV example preview](images/csv-example.png) - -Now that's nice, but we didn't want Alice, Bob, kickbox, or TypeScript. What we actually want is a CSV containing `Sony XBR-950G BRAVIA 4K HDR Ultra HD TV`, right? Let's do this! First, let's add `csv` to our imports: - -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal -# highlight-next-line -import csv -``` - -Next, instead of printing the data, we'll finish the program by exporting it to CSV. Replace `print(data)` with the following: - -```py -with open("products.csv", "w") as file: - writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"]) - writer.writeheader() - for row in data: - writer.writerow(row) -``` - -If we run our scraper now, it won't display any output, but it will create a `products.csv` file in the current working directory, which contains all the data about the listed products. 
-
-![CSV preview](images/csv.png)
-
## Saving data as JSON

The JSON format is popular primarily among developers. We use it for storing data, configuration files, or as a way to transfer data between programs (e.g., APIs). Its origin stems from the syntax of objects in the JavaScript programming language, which is similar to the syntax of Python dictionaries.

-In Python, there's a [`json`](https://docs.python.org/3/library/json.html) standard library module, which is so straightforward that we can start using it in our code right away. We'll need to begin with imports:
+In Python, we can read and write JSON using the [`json`](https://docs.python.org/3/library/json.html) standard library module. We'll begin with imports:

```py
import httpx
@@ -147,14 +93,14 @@ import csv
import json
```

-Next, let’s append one more export to end of the source code of our scraper:
+Next, instead of printing the data, we'll finish the program by exporting it to JSON. Replace `print(data)` with the following:

```py
with open("products.json", "w") as file:
    json.dump(data, file)
```

-That’s it! If we run the program now, it should also create a `products.json` file in the current working directory:
+That's it! If we run the program now, it should also create a `products.json` file in the current working directory:

```text
$ python main.py
@@ -176,7 +122,7 @@ with open("products.json", "w") as file:
    json.dump(data, file, default=serialize)
```

-Now the program should work as expected, producing a JSON file with the following content:
+If we run our scraper now, it won't display any output, but it will create a `products.json` file in the current working directory, which contains all the data about the listed products:

```json title=products.json
@@ -197,30 +143,67 @@ Also, if your data contains non-English characters, set `ensure_ascii=False`. By

:::

## Saving data as CSV

The CSV format is popular among data analysts because a wide range of tools can import it, including spreadsheet apps like LibreOffice Calc, Microsoft Excel, Apple Numbers, and Google Sheets.

In Python, we can read and write CSV using the [`csv`](https://docs.python.org/3/library/csv.html) standard library module. First, let's try something small in Python's interactive REPL to familiarize ourselves with the basic usage:

```py
>>> import csv
>>> with open("data.csv", "w") as file:
...     writer = csv.DictWriter(file, fieldnames=["name", "age", "hobbies"])
...     writer.writeheader()
...     writer.writerow({"name": "Alice", "age": 24, "hobbies": "kickbox, Python"})
...     writer.writerow({"name": "Bob", "age": 42, "hobbies": "reading, TypeScript"})
...
```

We first opened a new file for writing and created a `DictWriter()` instance with the expected field names. We instructed it to write the header row first and then added two more rows containing actual data. The code produced a `data.csv` file in the same directory where we're running the REPL. 
It has the following contents: -Open the `products.csv` file in a spreadsheet app. Use the app to find all products with a min price greater than $500. +```csv title=data.csv +name,age,hobbies +Alice,24,"kickbox, Python" +Bob,42,"reading, TypeScript" +``` -
- Solution +In the CSV format, if values contain commas, we should enclose them in quotes. You can see that the writer automatically handled this. - Let's use [Google Sheets](https://www.google.com/sheets/about/), which is free to use. After logging in with a Google account: +When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it as well. If you're using a different operating system, try opening the file with any spreadsheet program you have. - 1. Go to **File > Import**, choose **Upload**, and select the file. Import the data using the default settings. You should see a table with all the data. - 2. Select the header row. Go to **Data > Create filter**. - 3. Use the filter icon that appears next to `min_price`. Choose **Filter by condition**, select **Greater than**, and enter **500** in the text field. Confirm the dialog. You should see only the filtered data. +![CSV example preview](images/csv-example.png) - ![CSV in Google Sheets](images/csv-sheets.png) +Now that's nice, but we didn't want Alice, Bob, kickbox, or TypeScript. What we actually want is a CSV containing `Sony XBR-950G BRAVIA 4K HDR Ultra HD TV`, right? Let's do this! First, let's add `csv` to our imports: -
+```py +import httpx +from bs4 import BeautifulSoup +from decimal import Decimal +# highlight-next-line +import csv +``` + +Next, let’s append one more export to end of the source code of our scraper: + +```py +with open("products.csv", "w") as file: + writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"]) + writer.writeheader() + for row in data: + writer.writerow(row) +``` + +Now the program should work as expected, producing a CSV file with the following content: + +![CSV preview](images/csv.png) + +We've built a Python application that downloads a product listing, parses the data, and saves it in a structured format for further use. But the data still has gaps: for some products, we only have the min price, not the actual prices. In the next lesson, we'll attempt to scrape more details from all the product pages. + +--- + +## Exercises + +In this lesson, you learned how to create export files in two formats. The following challenges are designed to help you empathize with the people who'd be working with them. ### Process your JSON @@ -243,3 +226,20 @@ Write a new Python program that reads `products.json`, finds all products with a ``` + +### Process your CSV + +Open the `products.csv` file we created in the lesson using a spreadsheet application. Then, in the app, find all products with a min price greater than $500. + +
+ Solution + + Let's use [Google Sheets](https://www.google.com/sheets/about/), which is free to use. After logging in with a Google account: + + 1. Go to **File > Import**, choose **Upload**, and select the file. Import the data using the default settings. You should see a table with all the data. + 2. Select the header row. Go to **Data > Create filter**. + 3. Use the filter icon that appears next to `min_price`. Choose **Filter by condition**, select **Greater than**, and enter **500** in the text field. Confirm the dialog. You should see only the filtered data. + + ![CSV in Google Sheets](images/csv-sheets.png) + +
From 0b737fa620d50b33638487afbcfaf5e398a16c27 Mon Sep 17 00:00:00 2001
From: Honza Javorek
Date: Fri, 27 Jun 2025 15:41:23 +0200
Subject: [PATCH 2/4] fix: various improvements to the Python lesson about
 saving data

---
 .../scraping_basics_python/08_saving_data.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
index 08e47fa71..315a07b03 100644
--- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
+++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
@@ -93,7 +93,7 @@ import csv
 import json
 ```
 
-Next, instead of printing the data, we'll finish the program by exporting it to JSON. Replace `print(data)` with the following:
+Next, instead of printing the data, we'll finish the program by exporting it to JSON. Let's replace the line `print(data)` with the following:
 
 ```py
 with open("products.json", "w") as file:
@@ -167,9 +167,9 @@ Alice,24,"kickbox, Python"
 Bob,42,"reading, TypeScript"
 ```
 
-In the CSV format, if values contain commas, we should enclose them in quotes. You can see that the writer automatically handled this.
+In the CSV format, if a value contains commas, we should enclose it in quotes. When we open the file in a text editor of our choice, we can see that the writer automatically handled this.
 
-When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it as well. If you're using a different operating system, try opening the file with any spreadsheet program you have.
+When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it. If you're using a different operating system, try opening the file with any spreadsheet program you have.
 
 ![CSV example preview](images/csv-example.png)
 
@@ -183,7 +183,7 @@ from decimal import Decimal
 import csv
 ```
 
-Next, let’s append one more export to end of the source code of our scraper:
+Next, let's add one more data export to the end of the source code of our scraper:
 
 ```py
 with open("products.csv", "w") as file:
@@ -193,7 +193,7 @@ with open("products.csv", "w") as file:
     writer.writerow(row)
 ```
 
-Now the program should work as expected, producing a CSV file with the following content:
+The program should now also produce a CSV file with the following content:
 
 ![CSV preview](images/csv.png)
 
@@ -203,7 +203,7 @@ We've built a Python application that downloads a product listing, parses the da
 
 ## Exercises
 
-In this lesson, you learned how to create export files in two formats. The following challenges are designed to help you empathize with the people who'd be working with them.
+In this lesson, we created export files in two formats. The following challenges are designed to help you empathize with the people who'd be working with them. 
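+
+Before diving into them, it can help to load both files back and peek at what the people consuming our data will see. The following is only a rough sketch of such a check, assuming the `products.json` and `products.csv` files produced by the scraper above:
+
+```py
+import json
+import csv
+
+with open("products.json") as file:
+    json_data = json.load(file)
+
+with open("products.csv") as file:
+    csv_data = list(csv.DictReader(file))
+
+# both exports come from the same list, so the row counts should match
+print(len(json_data), len(csv_data))
+```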
### Process your JSON

From 238426c622e69d5b22adfd05c3cecf69e2a78d43 Mon Sep 17 00:00:00 2001
From: Honza Javorek
Date: Mon, 30 Jun 2025 10:16:07 +0200
Subject: [PATCH 3/4] fix: change order of JSON and CSV, fix some small errors

---
 .../09_getting_links.md | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
index a8444defa..0b32050df 100644
--- a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
+++ b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
@@ -115,7 +115,20 @@ def parse_product(product):
 
     return {"title": title, "min_price": min_price, "price": price}
 
-Now the CSV export. We'll make a small change here. Having to specify the field names is not ideal. What if we add more field names in the parsing function? We'd always have to remember to go and edit the export function as well. If we could figure out the field names in place, we'd remove this dependency. One way would be to infer the field names from the dictionary keys of the first row:
+Now the JSON export. To make its output easier to read, let's make a small change here and set the indentation level to two spaces:
+
+```py
+def export_json(file, data):
+    def serialize(obj):
+        if isinstance(obj, Decimal):
+            return str(obj)
+        raise TypeError("Object not JSON serializable")
+
+    # highlight-next-line
+    json.dump(data, file, default=serialize, indent=2)
+```
+
+The last function we'll add will take care of the CSV export. We'll make a small change here as well. Having to specify the field names is not ideal. What if we add more field names in the parsing function? We'd always have to remember to go and edit the export function as well. If we could figure out the field names in place, we'd remove this dependency. One way would be to infer the field names from the dictionary keys of the first row:
 
 ```py
 def export_csv(file, data):
@@ -133,19 +146,6 @@ The code above assumes the `data` variable contains at least one item, and that
 
 :::
 
-The last function we'll add will take care of the JSON export. 
For better readability of the JSON export, let's make a small change here too and set the indentation level to two spaces: - -```py -def export_json(file, data): - def serialize(obj): - if isinstance(obj, Decimal): - return str(obj) - raise TypeError("Object not JSON serializable") - - # highlight-next-line - json.dump(data, file, default=serialize, indent=2) -``` - Now let's put it all together: ```py @@ -406,8 +406,8 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u from bs4 import BeautifulSoup from urllib.parse import urljoin - url = "https://www.theguardian.com/sport/formulaone" - response = httpx.get(url) + listing_url = "https://www.theguardian.com/sport/formulaone" + response = httpx.get(listing_url) response.raise_for_status() html_code = response.text @@ -415,7 +415,7 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u for item in soup.select("#maincontent ul li"): link = item.select_one("a") - url = urljoin(url, link["href"]) + url = urljoin(listing_url, link["href"]) print(url) ``` From 07ce477319698a19361656ebb9ca09bcfa926f08 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 30 Jun 2025 10:27:45 +0200 Subject: [PATCH 4/4] fix: reorder JSON and CSV in the other lessons as well --- .../09_getting_links.md | 6 +-- .../10_crawling.md | 2 +- .../11_scraping_variants.md | 2 +- .../scraping_basics_python/08_saving_data.md | 10 ++++- .../09_getting_links.md | 38 +++++++++---------- .../scraping_basics_python/10_crawling.md | 22 +++++------ .../11_scraping_variants.md | 22 +++++------ 7 files changed, 55 insertions(+), 47 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md index 9d2a41333..76b083342 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md @@ -35,8 +35,8 @@ Over the course of the previous lessons, the code of our program grew to almost import httpx from bs4 import BeautifulSoup from decimal import Decimal -import csv import json +import csv url = "https://warehouse-theme-metal.myshopify.com/collections/sales" response = httpx.get(url) @@ -153,8 +153,8 @@ Now let's put it all together: import httpx from bs4 import BeautifulSoup from decimal import Decimal -import csv import json +import csv def download(url): response = httpx.get(url) @@ -279,8 +279,8 @@ Browsers reading the HTML know the base address and automatically resolve such l import httpx from bs4 import BeautifulSoup from decimal import Decimal -import csv import json +import csv # highlight-next-line from urllib.parse import urljoin ``` diff --git a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md index f46b0ec63..88f0c023f 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md @@ -20,8 +20,8 @@ Thanks to the refactoring, we have functions ready for each of the tasks, so we import httpx from bs4 import BeautifulSoup from decimal import Decimal -import csv import json +import csv from urllib.parse import urljoin def download(url): diff --git a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md 
index 0c68ea5b7..7c67de5f2 100644
--- a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md
+++ b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md
@@ -193,8 +193,8 @@ Now, if we use our new function, we should finally get a program that can scrape
 import httpx
 from bs4 import BeautifulSoup
 from decimal import Decimal
-import csv
 import json
+import csv
 from urllib.parse import urljoin
 
 def download(url):
diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
index 315a07b03..6567e24ef 100644
--- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
+++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
@@ -88,7 +88,6 @@ In Python, we can read and write JSON using the [`json`](https://docs.python.org
 import httpx
 from bs4 import BeautifulSoup
 from decimal import Decimal
-import csv
 # highlight-next-line
 import json
 ```
@@ -179,6 +179,7 @@
 import httpx
 from bs4 import BeautifulSoup
 from decimal import Decimal
+import json
 # highlight-next-line
 import csv
 ```
@@ -186,6 +186,14 @@ import csv
 
 Next, let's add one more data export to the end of the source code of our scraper:
 
 ```py
+def serialize(obj):
+    if isinstance(obj, Decimal):
+        return str(obj)
+    raise TypeError("Object not JSON serializable")
+
+with open("products.json", "w") as file:
+    json.dump(data, file, default=serialize)
+
 with open("products.csv", "w") as file:
     writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"])
     writer.writeheader()
diff --git a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
index 0b32050df..483958c22 100644
--- a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
+++ b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
@@ -34,8 +34,8 @@ Over the course of the previous lessons, the code of our program grew to almost
 import httpx
 from bs4 import BeautifulSoup
 from decimal import Decimal
-import csv
 import json
+import csv
 
 url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
 response = httpx.get(url)
@@ -65,12 +65,6 @@
 
     data.append({"title": title, "min_price": min_price, "price": price})
 
-with open("products.csv", "w") as file:
-    writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"])
-    writer.writeheader()
-    for row in data:
-        writer.writerow(row)
-
 def serialize(obj):
     if isinstance(obj, Decimal):
         return str(obj)
@@ -78,6 +72,12 @@
 
 with open("products.json", "w") as file:
     json.dump(data, file, default=serialize)
+
+with open("products.csv", "w") as file:
+    writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"])
+    writer.writeheader()
+    for row in data:
+        writer.writerow(row)
 ```
 
Let's introduce several functions to make the whole thing easier to digest. 
First, we can turn the beginning of our program into this `download()` function, which takes a URL and returns a `BeautifulSoup` instance: @@ -152,8 +152,8 @@ Now let's put it all together: import httpx from bs4 import BeautifulSoup from decimal import Decimal -import csv import json +import csv def download(url): response = httpx.get(url) @@ -182,13 +182,6 @@ def parse_product(product): return {"title": title, "min_price": min_price, "price": price} -def export_csv(file, data): - fieldnames = list(data[0].keys()) - writer = csv.DictWriter(file, fieldnames=fieldnames) - writer.writeheader() - for row in data: - writer.writerow(row) - def export_json(file, data): def serialize(obj): if isinstance(obj, Decimal): @@ -197,6 +190,13 @@ def export_json(file, data): json.dump(data, file, default=serialize, indent=2) +def export_csv(file, data): + fieldnames = list(data[0].keys()) + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + for row in data: + writer.writerow(row) + listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" listing_soup = download(listing_url) @@ -205,11 +205,11 @@ for product in listing_soup.select(".product-item"): item = parse_product(product) data.append(item) -with open("products.csv", "w") as file: - export_csv(file, data) - with open("products.json", "w") as file: export_json(file, data) + +with open("products.csv", "w") as file: + export_csv(file, data) ``` The program is much easier to read now. With the `parse_product()` function handy, we could also replace the convoluted loop with one that only takes up four lines of code. @@ -278,8 +278,8 @@ Browsers reading the HTML know the base address and automatically resolve such l import httpx from bs4 import BeautifulSoup from decimal import Decimal -import csv import json +import csv # highlight-next-line from urllib.parse import urljoin ``` diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md b/sources/academy/webscraping/scraping_basics_python/10_crawling.md index 158398e31..dc4d8cee2 100644 --- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md @@ -19,8 +19,8 @@ Thanks to the refactoring, we have functions ready for each of the tasks, so we import httpx from bs4 import BeautifulSoup from decimal import Decimal -import csv import json +import csv from urllib.parse import urljoin def download(url): @@ -52,13 +52,6 @@ def parse_product(product, base_url): return {"title": title, "min_price": min_price, "price": price, "url": url} -def export_csv(file, data): - fieldnames = list(data[0].keys()) - writer = csv.DictWriter(file, fieldnames=fieldnames) - writer.writeheader() - for row in data: - writer.writerow(row) - def export_json(file, data): def serialize(obj): if isinstance(obj, Decimal): @@ -67,6 +60,13 @@ def export_json(file, data): json.dump(data, file, default=serialize, indent=2) +def export_csv(file, data): + fieldnames = list(data[0].keys()) + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + for row in data: + writer.writerow(row) + listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" listing_soup = download(listing_url) @@ -75,11 +75,11 @@ for product in listing_soup.select(".product-item"): item = parse_product(product, listing_url) data.append(item) -with open("products.csv", "w") as file: - export_csv(file, data) - with open("products.json", "w") as file: export_json(file, data) + +with 
open("products.csv", "w") as file: + export_csv(file, data) ``` ## Extracting vendor name diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index 7f0322e64..2d8b9e822 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -192,8 +192,8 @@ Now, if we use our new function, we should finally get a program that can scrape import httpx from bs4 import BeautifulSoup from decimal import Decimal -import csv import json +import csv from urllib.parse import urljoin def download(url): @@ -235,13 +235,6 @@ def parse_variant(variant): ) return {"variant_name": name, "price": price} -def export_csv(file, data): - fieldnames = list(data[0].keys()) - writer = csv.DictWriter(file, fieldnames=fieldnames) - writer.writeheader() - for row in data: - writer.writerow(row) - def export_json(file, data): def serialize(obj): if isinstance(obj, Decimal): @@ -250,6 +243,13 @@ def export_json(file, data): json.dump(data, file, default=serialize, indent=2) +def export_csv(file, data): + fieldnames = list(data[0].keys()) + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + for row in data: + writer.writerow(row) + listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" listing_soup = download(listing_url) @@ -267,11 +267,11 @@ for product in listing_soup.select(".product-item"): item["variant_name"] = None data.append(item) -with open("products.csv", "w") as file: - export_csv(file, data) - with open("products.json", "w") as file: export_json(file, data) + +with open("products.csv", "w") as file: + export_csv(file, data) ``` Let's run the scraper and see if all the items in the data contain prices: