diff --git a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md
index 9d2a41333..76b083342 100644
--- a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md
+++ b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md
@@ -35,8 +35,8 @@ Over the course of the previous lessons, the code of our program grew to almost
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
import json
+import csv
url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
response = httpx.get(url)
@@ -153,8 +153,8 @@ Now let's put it all together:
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
import json
+import csv
def download(url):
response = httpx.get(url)
@@ -279,8 +279,8 @@ Browsers reading the HTML know the base address and automatically resolve such l
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
import json
+import csv
# highlight-next-line
from urllib.parse import urljoin
```
diff --git a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md
index f46b0ec63..88f0c023f 100644
--- a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md
+++ b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md
@@ -20,8 +20,8 @@ Thanks to the refactoring, we have functions ready for each of the tasks, so we
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
import json
+import csv
from urllib.parse import urljoin
def download(url):
diff --git a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md
index 0c68ea5b7..7c67de5f2 100644
--- a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md
+++ b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md
@@ -193,8 +193,8 @@ Now, if we use our new function, we should finally get a program that can scrape
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
import json
+import csv
from urllib.parse import urljoin
def download(url):
diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
index 8d8690c2c..6567e24ef 100644
--- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
+++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
@@ -78,83 +78,28 @@ If you find the complex data structures printed by `print()` difficult to read,
:::
-## Saving data as CSV
-
-The CSV format is popular among data analysts because a wide range of tools can import it, including spreadsheets apps like LibreOffice Calc, Microsoft Excel, Apple Numbers, and Google Sheets.
-
-In Python, it's convenient to read and write CSV files, thanks to the [`csv`](https://docs.python.org/3/library/csv.html) standard library module. First let's try something small in the Python's interactive REPL to familiarize ourselves with the basic usage:
-
-```py
->>> import csv
->>> with open("data.csv", "w") as file:
-... writer = csv.DictWriter(file, fieldnames=["name", "age", "hobbies"])
-... writer.writeheader()
-... writer.writerow({"name": "Alice", "age": 24, "hobbies": "kickbox, Python"})
-... writer.writerow({"name": "Bob", "age": 42, "hobbies": "reading, TypeScript"})
-...
-```
-
-We first opened a new file for writing and created a `DictWriter()` instance with the expected field names. We instructed it to write the header row first and then added two more rows containing actual data. The code produced a `data.csv` file in the same directory where we're running the REPL. It has the following contents:
-
-```csv title=data.csv
-name,age,hobbies
-Alice,24,"kickbox, Python"
-Bob,42,"reading, TypeScript"
-```
-
-In the CSV format, if values contain commas, we should enclose them in quotes. You can see that the writer automatically handled this.
-
-When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it as well. If you're using a different operating system, try opening the file with any spreadsheet program you have.
-
-
-
-Now that's nice, but we didn't want Alice, Bob, kickbox, or TypeScript. What we actually want is a CSV containing `Sony XBR-950G BRAVIA 4K HDR Ultra HD TV`, right? Let's do this! First, let's add `csv` to our imports:
-
-```py
-import httpx
-from bs4 import BeautifulSoup
-from decimal import Decimal
-# highlight-next-line
-import csv
-```
-
-Next, instead of printing the data, we'll finish the program by exporting it to CSV. Replace `print(data)` with the following:
-
-```py
-with open("products.csv", "w") as file:
- writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"])
- writer.writeheader()
- for row in data:
- writer.writerow(row)
-```
-
-If we run our scraper now, it won't display any output, but it will create a `products.csv` file in the current working directory, which contains all the data about the listed products.
-
-
-
## Saving data as JSON
The JSON format is popular primarily among developers. We use it for storing data, configuration files, or as a way to transfer data between programs (e.g., APIs). Its origin stems from the syntax of objects in the JavaScript programming language, which is similar to the syntax of Python dictionaries.
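+
+For example, the following line (a made-up record, just for illustration) is both a valid Python dictionary literal and a valid JSON document, as long as we stick to double quotes:
+
+```py
+{"name": "Alice", "age": 24, "hobbies": ["kickbox", "Python"]}
+```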
-In Python, there's a [`json`](https://docs.python.org/3/library/json.html) standard library module, which is so straightforward that we can start using it in our code right away. We'll need to begin with imports:
+In Python, we can read and write JSON using the [`json`](https://docs.python.org/3/library/json.html) standard library module. We'll begin with imports:
```py
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
# highlight-next-line
import json
```
-Next, let’s append one more export to end of the source code of our scraper:
+Next, instead of printing the data, we'll finish the program by exporting it to JSON. Let's replace the line `print(data)` with the following:
```py
with open("products.json", "w") as file:
json.dump(data, file)
```
-That’s it! If we run the program now, it should also create a `products.json` file in the current working directory:
+That's it! If we run the program now, it should also create a `products.json` file in the current working directory:
```text
$ python main.py
@@ -176,7 +121,7 @@ with open("products.json", "w") as file:
json.dump(data, file, default=serialize)
```
-Now the program should work as expected, producing a JSON file with the following content:
+If we run our scraper now, it won't display any output, but it will create a `products.json` file in the current working directory, which contains all the data about the listed products:
```json title=products.json
@@ -197,30 +142,76 @@ Also, if your data contains non-English characters, set `ensure_ascii=False`. By
:::
-We've built a Python application that downloads a product listing, parses the data, and saves it in a structured format for further use. But the data still has gaps: for some products, we only have the min price, not the actual prices. In the next lesson, we'll attempt to scrape more details from all the product pages.
+## Saving data as CSV
----
+The CSV format is popular among data analysts because a wide range of tools can import it, including spreadsheet apps like LibreOffice Calc, Microsoft Excel, Apple Numbers, and Google Sheets.
-## Exercises
+In Python, we can read and write CSV using the [`csv`](https://docs.python.org/3/library/csv.html) standard library module. First, let's try something small in Python's interactive REPL to familiarize ourselves with the basic usage:
-In this lesson, you learned how to create export files in two formats. The following challenges are designed to help you empathize with the people who'd be working with them.
+```py
+>>> import csv
+>>> with open("data.csv", "w") as file:
+... writer = csv.DictWriter(file, fieldnames=["name", "age", "hobbies"])
+... writer.writeheader()
+... writer.writerow({"name": "Alice", "age": 24, "hobbies": "kickbox, Python"})
+... writer.writerow({"name": "Bob", "age": 42, "hobbies": "reading, TypeScript"})
+...
+```
-### Process your CSV
+We first opened a new file for writing and created a `DictWriter()` instance with the expected field names. We instructed it to write the header row first and then added two more rows containing actual data. The code produced a `data.csv` file in the same directory where we're running the REPL. It has the following contents:
-Open the `products.csv` file in a spreadsheet app. Use the app to find all products with a min price greater than $500.
+```csv title=data.csv
+name,age,hobbies
+Alice,24,"kickbox, Python"
+Bob,42,"reading, TypeScript"
+```
-<details>
-  <summary>Solution</summary>
+In the CSV format, if a value contains commas, we should enclose it in quotes. When we open the file in a text editor of our choice, we can see that the writer automatically handled this.
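+
+To check this programmatically, we can read the file back in the REPL (a quick sketch assuming the `data.csv` file we just created) and see that `csv.DictReader` returns each quoted value as a single field:
+
+```py
+>>> import csv
+>>> with open("data.csv") as file:
+...     for row in csv.DictReader(file):
+...         print(row["hobbies"])
+...
+kickbox, Python
+reading, TypeScript
+```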
- Let's use [Google Sheets](https://www.google.com/sheets/about/), which is free to use. After logging in with a Google account:
+When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it. If you're using a different operating system, try opening the file with any spreadsheet program you have.
- 1. Go to **File > Import**, choose **Upload**, and select the file. Import the data using the default settings. You should see a table with all the data.
- 2. Select the header row. Go to **Data > Create filter**.
- 3. Use the filter icon that appears next to `min_price`. Choose **Filter by condition**, select **Greater than**, and enter **500** in the text field. Confirm the dialog. You should see only the filtered data.
+
- 
+Now that's nice, but we didn't want Alice, Bob, kickbox, or TypeScript. What we actually want is a CSV containing `Sony XBR-950G BRAVIA 4K HDR Ultra HD TV`, right? Let's do this! First, let's add `csv` to our imports:
-</details>
+```py
+import httpx
+from bs4 import BeautifulSoup
+from decimal import Decimal
+import json
+# highlight-next-line
+import csv
+```
+
+Next, let's add one more data export to the end of the source code of our scraper:
+
+```py
+def serialize(obj):
+ if isinstance(obj, Decimal):
+ return str(obj)
+ raise TypeError("Object not JSON serializable")
+
+with open("products.json", "w") as file:
+ json.dump(data, file, default=serialize)
+
+with open("products.csv", "w") as file:
+ writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"])
+ writer.writeheader()
+ for row in data:
+ writer.writerow(row)
+```
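+
+Note that unlike the JSON export, the CSV export works without a `serialize()` helper. When writing a row, the `csv` writer converts non-string values using `str()`, so our `Decimal` prices end up in the file as plain numbers.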
+
+The program should now also produce a CSV file with the following content:
+
+
+
+We've built a Python application that downloads a product listing, parses the data, and saves it in a structured format for further use. But the data still has gaps: for some products, we only have the min price, not the actual prices. In the next lesson, we'll attempt to scrape more details from all the product pages.
+
+---
+
+## Exercises
+
+In this lesson, we created export files in two formats. The following challenges are designed to help you empathize with the people who'd be working with them.
### Process your JSON
@@ -243,3 +234,20 @@ Write a new Python program that reads `products.json`, finds all products with a
```
+
+### Process your CSV
+
+Using a spreadsheet app, open the `products.csv` file we created in this lesson, and find all products with a min price greater than $500.
+
+<details>
+  <summary>Solution</summary>
+
+ Let's use [Google Sheets](https://www.google.com/sheets/about/), which is free to use. After logging in with a Google account:
+
+ 1. Go to **File > Import**, choose **Upload**, and select the file. Import the data using the default settings. You should see a table with all the data.
+ 2. Select the header row. Go to **Data > Create filter**.
+ 3. Use the filter icon that appears next to `min_price`. Choose **Filter by condition**, select **Greater than**, and enter **500** in the text field. Confirm the dialog. You should see only the filtered data.
+
+ 
+
+</details>
diff --git a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
index a8444defa..483958c22 100644
--- a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
+++ b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
@@ -34,8 +34,8 @@ Over the course of the previous lessons, the code of our program grew to almost
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
import json
+import csv
url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
response = httpx.get(url)
@@ -65,12 +65,6 @@ for product in soup.select(".product-item"):
data.append({"title": title, "min_price": min_price, "price": price})
-with open("products.csv", "w") as file:
- writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"])
- writer.writeheader()
- for row in data:
- writer.writerow(row)
-
def serialize(obj):
if isinstance(obj, Decimal):
return str(obj)
@@ -78,6 +72,12 @@ def serialize(obj):
with open("products.json", "w") as file:
json.dump(data, file, default=serialize)
+
+with open("products.csv", "w") as file:
+ writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"])
+ writer.writeheader()
+ for row in data:
+ writer.writerow(row)
```
Let's introduce several functions to make the whole thing easier to digest. First, we can turn the beginning of our program into this `download()` function, which takes a URL and returns a `BeautifulSoup` instance:
@@ -115,7 +115,20 @@ def parse_product(product):
return {"title": title, "min_price": min_price, "price": price}
```
-Now the CSV export. We'll make a small change here. Having to specify the field names is not ideal. What if we add more field names in the parsing function? We'd always have to remember to go and edit the export function as well. If we could figure out the field names in place, we'd remove this dependency. One way would be to infer the field names from the dictionary keys of the first row:
+Now the JSON export. To make the resulting file easier to read, let's make a small change here and set the indentation level to two spaces:
+
+```py
+def export_json(file, data):
+ def serialize(obj):
+ if isinstance(obj, Decimal):
+ return str(obj)
+ raise TypeError("Object not JSON serializable")
+
+ # highlight-next-line
+ json.dump(data, file, default=serialize, indent=2)
+```
+
+The last function we'll add will take care of the CSV export. We'll make a small change here as well. Having to specify the field names is not ideal. What if we add more fields in the parsing function? We'd always have to remember to edit the export function, too. If we could figure out the field names in place, we'd remove this dependency. One way is to infer them from the dictionary keys of the first row:
```py
def export_csv(file, data):
@@ -133,27 +146,14 @@ The code above assumes the `data` variable contains at least one item, and that
:::
-The last function we'll add will take care of the JSON export. For better readability of the JSON export, let's make a small change here too and set the indentation level to two spaces:
-
-```py
-def export_json(file, data):
- def serialize(obj):
- if isinstance(obj, Decimal):
- return str(obj)
- raise TypeError("Object not JSON serializable")
-
- # highlight-next-line
- json.dump(data, file, default=serialize, indent=2)
-```
-
Now let's put it all together:
```py
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
import json
+import csv
def download(url):
response = httpx.get(url)
@@ -182,13 +182,6 @@ def parse_product(product):
return {"title": title, "min_price": min_price, "price": price}
-def export_csv(file, data):
- fieldnames = list(data[0].keys())
- writer = csv.DictWriter(file, fieldnames=fieldnames)
- writer.writeheader()
- for row in data:
- writer.writerow(row)
-
def export_json(file, data):
def serialize(obj):
if isinstance(obj, Decimal):
@@ -197,6 +190,13 @@ def export_json(file, data):
json.dump(data, file, default=serialize, indent=2)
+def export_csv(file, data):
+ fieldnames = list(data[0].keys())
+ writer = csv.DictWriter(file, fieldnames=fieldnames)
+ writer.writeheader()
+ for row in data:
+ writer.writerow(row)
+
listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
listing_soup = download(listing_url)
@@ -205,11 +205,11 @@ for product in listing_soup.select(".product-item"):
item = parse_product(product)
data.append(item)
-with open("products.csv", "w") as file:
- export_csv(file, data)
-
with open("products.json", "w") as file:
export_json(file, data)
+
+with open("products.csv", "w") as file:
+ export_csv(file, data)
```
The program is much easier to read now. With the `parse_product()` function handy, we could also replace the convoluted loop with one that only takes up four lines of code.
@@ -278,8 +278,8 @@ Browsers reading the HTML know the base address and automatically resolve such l
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
import json
+import csv
# highlight-next-line
from urllib.parse import urljoin
```
@@ -406,8 +406,8 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u
from bs4 import BeautifulSoup
from urllib.parse import urljoin
- url = "https://www.theguardian.com/sport/formulaone"
- response = httpx.get(url)
+ listing_url = "https://www.theguardian.com/sport/formulaone"
+ response = httpx.get(listing_url)
response.raise_for_status()
html_code = response.text
@@ -415,7 +415,7 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u
for item in soup.select("#maincontent ul li"):
link = item.select_one("a")
- url = urljoin(url, link["href"])
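+    # resolve each link against the listing URL, which now never gets overwritten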
+ url = urljoin(listing_url, link["href"])
print(url)
```
diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md b/sources/academy/webscraping/scraping_basics_python/10_crawling.md
index 158398e31..dc4d8cee2 100644
--- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md
+++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md
@@ -19,8 +19,8 @@ Thanks to the refactoring, we have functions ready for each of the tasks, so we
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
import json
+import csv
from urllib.parse import urljoin
def download(url):
@@ -52,13 +52,6 @@ def parse_product(product, base_url):
return {"title": title, "min_price": min_price, "price": price, "url": url}
-def export_csv(file, data):
- fieldnames = list(data[0].keys())
- writer = csv.DictWriter(file, fieldnames=fieldnames)
- writer.writeheader()
- for row in data:
- writer.writerow(row)
-
def export_json(file, data):
def serialize(obj):
if isinstance(obj, Decimal):
@@ -67,6 +60,13 @@ def export_json(file, data):
json.dump(data, file, default=serialize, indent=2)
+def export_csv(file, data):
+ fieldnames = list(data[0].keys())
+ writer = csv.DictWriter(file, fieldnames=fieldnames)
+ writer.writeheader()
+ for row in data:
+ writer.writerow(row)
+
listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
listing_soup = download(listing_url)
@@ -75,11 +75,11 @@ for product in listing_soup.select(".product-item"):
item = parse_product(product, listing_url)
data.append(item)
-with open("products.csv", "w") as file:
- export_csv(file, data)
-
with open("products.json", "w") as file:
export_json(file, data)
+
+with open("products.csv", "w") as file:
+ export_csv(file, data)
```
## Extracting vendor name
diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md
index 7f0322e64..2d8b9e822 100644
--- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md
+++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md
@@ -192,8 +192,8 @@ Now, if we use our new function, we should finally get a program that can scrape
import httpx
from bs4 import BeautifulSoup
from decimal import Decimal
-import csv
import json
+import csv
from urllib.parse import urljoin
def download(url):
@@ -235,13 +235,6 @@ def parse_variant(variant):
)
return {"variant_name": name, "price": price}
-def export_csv(file, data):
- fieldnames = list(data[0].keys())
- writer = csv.DictWriter(file, fieldnames=fieldnames)
- writer.writeheader()
- for row in data:
- writer.writerow(row)
-
def export_json(file, data):
def serialize(obj):
if isinstance(obj, Decimal):
@@ -250,6 +243,13 @@ def export_json(file, data):
json.dump(data, file, default=serialize, indent=2)
+def export_csv(file, data):
+ fieldnames = list(data[0].keys())
+ writer = csv.DictWriter(file, fieldnames=fieldnames)
+ writer.writeheader()
+ for row in data:
+ writer.writerow(row)
+
listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
listing_soup = download(listing_url)
@@ -267,11 +267,11 @@ for product in listing_soup.select(".product-item"):
item["variant_name"] = None
data.append(item)
-with open("products.csv", "w") as file:
- export_csv(file, data)
-
with open("products.json", "w") as file:
export_json(file, data)
+
+with open("products.csv", "w") as file:
+ export_csv(file, data)
```
Let's run the scraper and see if all the items in the data contain prices: