From 0c087a7feb9c1a9c12a33df89c777b802c542afb Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 16 May 2025 12:16:03 +0200 Subject: [PATCH 01/26] fix: update intro, lesson titles, and descriptions to mention JS --- .../04_downloading_html.md | 6 +++--- .../05_parsing_html.md | 6 +++--- .../06_locating_elements.md | 6 +++--- .../07_extracting_data.md | 4 ++-- .../08_saving_data.md | 6 +++--- .../09_getting_links.md | 6 +++--- .../scraping_basics_javascript2/10_crawling.md | 6 +++--- .../11_scraping_variants.md | 4 ++-- .../scraping_basics_javascript2/12_framework.md | 4 ++-- .../scraping_basics_javascript2/13_platform.md | 4 ++-- .../scraping_basics_javascript2/index.md | 16 ++++++++-------- 11 files changed, 34 insertions(+), 34 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md index ec361214f..1c2f53651 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md @@ -1,14 +1,14 @@ --- -title: Downloading HTML with Python +title: Downloading HTML with Node.js sidebar_label: Downloading HTML -description: Lesson about building a Python application for watching prices. Using the HTTPX library to download HTML code of a product listing page. +description: Lesson about building a Node.js application for watching prices. Using the /TBD/ library to download HTML code of a product listing page. slug: /scraping-basics-javascript2/downloading-html unlisted: true --- import Exercises from './_exercises.mdx'; -**In this lesson we'll start building a Python application for watching prices. As a first step, we'll use the HTTPX library to download HTML code of a product listing page.** +**In this lesson we'll start building a Node.js application for watching prices. As a first step, we'll use the /TBD/ library to download HTML code of a product listing page.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md index 81aaf6778..3a5de6368 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md @@ -1,14 +1,14 @@ --- -title: Parsing HTML with Python +title: Parsing HTML with Node.js sidebar_label: Parsing HTML -description: Lesson about building a Python application for watching prices. Using the Beautiful Soup library to parse HTML code of a product listing page. +description: Lesson about building a Node.js application for watching prices. Using the /TBD/ library to parse HTML code of a product listing page. slug: /scraping-basics-javascript2/parsing-html unlisted: true --- import Exercises from './_exercises.mdx'; -**In this lesson we'll look for products in the downloaded HTML. We'll use BeautifulSoup to turn the HTML into objects which we can work with in our Python program.** +**In this lesson we'll look for products in the downloaded HTML. 
We'll use /TBD/ to turn the HTML into objects which we can work with in our Node.js program.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md index ef85a2612..8fd29410f 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md @@ -1,14 +1,14 @@ --- -title: Locating HTML elements with Python +title: Locating HTML elements with Node.js sidebar_label: Locating HTML elements -description: Lesson about building a Python application for watching prices. Using the Beautiful Soup library to locate products on the product listing page. +description: Lesson about building a Node.js application for watching prices. Using the /TBD/ library to locate products on the product listing page. slug: /scraping-basics-javascript2/locating-elements unlisted: true --- import Exercises from './_exercises.mdx'; -**In this lesson we'll locate product data in the downloaded HTML. We'll use BeautifulSoup to find those HTML elements which contain details about each product, such as title or price.** +**In this lesson we'll locate product data in the downloaded HTML. We'll use /TBD/ to find those HTML elements which contain details about each product, such as title or price.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md b/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md index 81a375dc5..5a156a682 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md @@ -1,7 +1,7 @@ --- -title: Extracting data from HTML with Python +title: Extracting data from HTML with Node.js sidebar_label: Extracting data from HTML -description: Lesson about building a Python application for watching prices. Using string manipulation to extract and clean data scraped from the product listing page. +description: Lesson about building a Node.js application for watching prices. Using string manipulation to extract and clean data scraped from the product listing page. slug: /scraping-basics-javascript2/extracting-data unlisted: true --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md index b2c027a8c..ba3871f05 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md @@ -1,12 +1,12 @@ --- -title: Saving data with Python +title: Saving data with Node.js sidebar_label: Saving data -description: Lesson about building a Python application for watching prices. Using standard library to save data scraped from product listing pages in popular formats such as CSV or JSON. +description: Lesson about building a Node.js application for watching prices. Using /TBD/ to save data scraped from product listing pages in popular formats such as CSV or JSON. slug: /scraping-basics-javascript2/saving-data unlisted: true --- -**In this lesson, we'll save the data we scraped in the popular formats, such as CSV or JSON. We'll use Python's standard library to export the files.** +**In this lesson, we'll save the data we scraped in the popular formats, such as CSV or JSON. 
We'll use /TBD/ to export the files.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md index 76b083342..d71d9cd4d 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md @@ -1,14 +1,14 @@ --- -title: Getting links from HTML with Python +title: Getting links from HTML with Node.js sidebar_label: Getting links from HTML -description: Lesson about building a Python application for watching prices. Using the Beautiful Soup library to locate links to individual product pages. +description: Lesson about building a Node.js application for watching prices. Using the /TBD/ library to locate links to individual product pages. slug: /scraping-basics-javascript2/getting-links unlisted: true --- import Exercises from './_exercises.mdx'; -**In this lesson, we'll locate and extract links to individual product pages. We'll use BeautifulSoup to find the relevant bits of HTML.** +**In this lesson, we'll locate and extract links to individual product pages. We'll use /TBD/ to find the relevant bits of HTML.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md index 88f0c023f..c0ece522f 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md @@ -1,14 +1,14 @@ --- -title: Crawling websites with Python +title: Crawling websites with Node.js sidebar_label: Crawling websites -description: Lesson about building a Python application for watching prices. Using the HTTPX library to follow links to individual product pages. +description: Lesson about building a Node.js application for watching prices. Using the /TBD/ library to follow links to individual product pages. slug: /scraping-basics-javascript2/crawling unlisted: true --- import Exercises from './_exercises.mdx'; -**In this lesson, we'll follow links to individual product pages. We'll use HTTPX to download them and BeautifulSoup to process them.** +**In this lesson, we'll follow links to individual product pages. We'll use /TBD/ to download them and /TBD/ to process them.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md index 7c67de5f2..6cebba658 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md @@ -1,7 +1,7 @@ --- -title: Scraping product variants with Python +title: Scraping product variants with Node.js sidebar_label: Scraping product variants -description: Lesson about building a Python application for watching prices. Using browser DevTools to figure out how to extract product variants and exporting them as separate items. +description: Lesson about building a Node.js application for watching prices. Using browser DevTools to figure out how to extract product variants and exporting them as separate items. 
slug: /scraping-basics-javascript2/scraping-variants unlisted: true --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript2/12_framework.md index 3cf1f02c7..fe80fb5fc 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/12_framework.md @@ -1,7 +1,7 @@ --- -title: Using a scraping framework with Python +title: Using a scraping framework with Node.js sidebar_label: Using a framework -description: Lesson about building a Python application for watching prices. Using the Crawlee framework to simplify creating a scraper. +description: Lesson about building a Node.js application for watching prices. Using the Crawlee framework to simplify creating a scraper. slug: /scraping-basics-javascript2/framework unlisted: true --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/13_platform.md b/sources/academy/webscraping/scraping_basics_javascript2/13_platform.md index e1bb36f3f..475f36a17 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/13_platform.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/13_platform.md @@ -1,7 +1,7 @@ --- -title: Using a scraping platform with Python +title: Using a scraping platform with Node.js sidebar_label: Using a platform -description: Lesson about building a Python application for watching prices. Using the Apify platform to deploy a scraper. +description: Lesson about building a Node.js application for watching prices. Using the Apify platform to deploy a scraper. slug: /scraping-basics-javascript2/platform unlisted: true --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/index.md b/sources/academy/webscraping/scraping_basics_javascript2/index.md index 03c7dde99..1158f717b 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/index.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/index.md @@ -9,32 +9,32 @@ unlisted: true import DocCardList from '@theme/DocCardList'; -**Learn how to use Python to extract information from websites in this practical course, starting from the absolute basics.** +**Learn how to use JavaScript to extract information from websites in this practical course, starting from the absolute basics.** --- -In this course we'll use Python to create an application for watching prices. It'll be able to scrape all product pages of an e-commerce website and record prices. Data from several runs of such program would be useful for seeing trends in price changes, detecting discounts, etc. +In this course we'll use JavaScript to create an application for watching prices. It'll be able to scrape all product pages of an e-commerce website and record prices. Data from several runs of such program would be useful for seeing trends in price changes, detecting discounts, etc. ![E-commerce listing on the left, JSON with data on the right](./images/scraping.webp) ## What we'll do - Inspect pages using browser DevTools. -- Download web pages using the HTTPX library. -- Extract data from web pages using the Beautiful Soup library. +- Download web pages using the /TBD/ library. +- Extract data from web pages using the /TBD/ library. - Save extracted data in various formats, e.g. CSV which MS Excel or Google Sheets can open. - Follow links programmatically (crawling). - Save time and effort with frameworks, such as Crawlee, and scraping platforms, such as Apify. 
## Who this course is for -Anyone with basic knowledge of developing programs in Python who wants to start with web scraping can take this course. The course does not expect you to have any prior knowledge of web technologies or scraping. +Anyone with basic knowledge of developing programs in JavaScript who wants to start with web scraping can take this course. The course does not expect you to have any prior knowledge of web technologies or scraping. ## Requirements -- A macOS, Linux, or Windows machine with a web browser and Python installed. -- Familiarity with Python basics: variables, conditions, loops, functions, strings, lists, dictionaries, files, classes, and exceptions. -- Comfort with importing from the Python standard library, using virtual environments, and installing dependencies with `pip`. +- A macOS, Linux, or Windows machine with a web browser and Node.js installed. +- Familiarity with JavaScript basics: variables, conditions, loops, functions, strings, lists, dictionaries, files, classes, and exceptions. +- Comfort with building a Node.js package and installing dependencies with `npm`. - Familiarity with running commands in Terminal (macOS/Linux) or Command Prompt (Windows). ## You may want to know From 5cfd912c323e01fad2f3832bbb9d7e93e530b65f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 16 May 2025 13:08:29 +0200 Subject: [PATCH 02/26] feat: decide about the technologies --- .../scraping_basics_javascript2/04_downloading_html.md | 4 ++-- .../scraping_basics_javascript2/05_parsing_html.md | 4 ++-- .../scraping_basics_javascript2/06_locating_elements.md | 4 ++-- .../scraping_basics_javascript2/08_saving_data.md | 4 ++-- .../scraping_basics_javascript2/09_getting_links.md | 4 ++-- .../webscraping/scraping_basics_javascript2/10_crawling.md | 4 ++-- .../webscraping/scraping_basics_javascript2/index.md | 6 +++--- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md index 1c2f53651..44d582e5f 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md @@ -1,14 +1,14 @@ --- title: Downloading HTML with Node.js sidebar_label: Downloading HTML -description: Lesson about building a Node.js application for watching prices. Using the /TBD/ library to download HTML code of a product listing page. +description: Lesson about building a Node.js application for watching prices. Using the Fetch API to download HTML code of a product listing page. slug: /scraping-basics-javascript2/downloading-html unlisted: true --- import Exercises from './_exercises.mdx'; -**In this lesson we'll start building a Node.js application for watching prices. As a first step, we'll use the /TBD/ library to download HTML code of a product listing page.** +**In this lesson we'll start building a Node.js application for watching prices. 
As a first step, we'll use the Fetch API to download HTML code of a product listing page.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md index 3a5de6368..6f96ed2c7 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md @@ -1,14 +1,14 @@ --- title: Parsing HTML with Node.js sidebar_label: Parsing HTML -description: Lesson about building a Node.js application for watching prices. Using the /TBD/ library to parse HTML code of a product listing page. +description: Lesson about building a Node.js application for watching prices. Using the Cheerio library to parse HTML code of a product listing page. slug: /scraping-basics-javascript2/parsing-html unlisted: true --- import Exercises from './_exercises.mdx'; -**In this lesson we'll look for products in the downloaded HTML. We'll use /TBD/ to turn the HTML into objects which we can work with in our Node.js program.** +**In this lesson we'll look for products in the downloaded HTML. We'll use Cheerio to turn the HTML into objects which we can work with in our Node.js program.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md index 8fd29410f..2aa3100e7 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md @@ -1,14 +1,14 @@ --- title: Locating HTML elements with Node.js sidebar_label: Locating HTML elements -description: Lesson about building a Node.js application for watching prices. Using the /TBD/ library to locate products on the product listing page. +description: Lesson about building a Node.js application for watching prices. Using the Cheerio library to locate products on the product listing page. slug: /scraping-basics-javascript2/locating-elements unlisted: true --- import Exercises from './_exercises.mdx'; -**In this lesson we'll locate product data in the downloaded HTML. We'll use /TBD/ to find those HTML elements which contain details about each product, such as title or price.** +**In this lesson we'll locate product data in the downloaded HTML. We'll use Cheerio to find those HTML elements which contain details about each product, such as title or price.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md index ba3871f05..e1ad7365a 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md @@ -1,12 +1,12 @@ --- title: Saving data with Node.js sidebar_label: Saving data -description: Lesson about building a Node.js application for watching prices. Using /TBD/ to save data scraped from product listing pages in popular formats such as CSV or JSON. +description: Lesson about building a Node.js application for watching prices. Using the json2csv library to save data scraped from product listing pages in both JSON and CSV. slug: /scraping-basics-javascript2/saving-data unlisted: true --- -**In this lesson, we'll save the data we scraped in the popular formats, such as CSV or JSON. 
We'll use /TBD/ to export the files.** +**In this lesson, we'll save the data we scraped in the popular formats, such as CSV or JSON. We'll use the json2csv library to export the files.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md index d71d9cd4d..1eebf4cbc 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md @@ -1,14 +1,14 @@ --- title: Getting links from HTML with Node.js sidebar_label: Getting links from HTML -description: Lesson about building a Node.js application for watching prices. Using the /TBD/ library to locate links to individual product pages. +description: Lesson about building a Node.js application for watching prices. Using the Cheerio library to locate links to individual product pages. slug: /scraping-basics-javascript2/getting-links unlisted: true --- import Exercises from './_exercises.mdx'; -**In this lesson, we'll locate and extract links to individual product pages. We'll use /TBD/ to find the relevant bits of HTML.** +**In this lesson, we'll locate and extract links to individual product pages. We'll use Cheerio to find the relevant bits of HTML.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md index c0ece522f..98d47b54e 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md @@ -1,14 +1,14 @@ --- title: Crawling websites with Node.js sidebar_label: Crawling websites -description: Lesson about building a Node.js application for watching prices. Using the /TBD/ library to follow links to individual product pages. +description: Lesson about building a Node.js application for watching prices. Using the Fetch API to follow links to individual product pages. slug: /scraping-basics-javascript2/crawling unlisted: true --- import Exercises from './_exercises.mdx'; -**In this lesson, we'll follow links to individual product pages. We'll use /TBD/ to download them and /TBD/ to process them.** +**In this lesson, we'll follow links to individual product pages. We'll use the Fetch API to download them and Cheerio to process them.** --- diff --git a/sources/academy/webscraping/scraping_basics_javascript2/index.md b/sources/academy/webscraping/scraping_basics_javascript2/index.md index 1158f717b..c3d9893a1 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/index.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/index.md @@ -20,9 +20,9 @@ In this course we'll use JavaScript to create an application for watching prices ## What we'll do - Inspect pages using browser DevTools. -- Download web pages using the /TBD/ library. -- Extract data from web pages using the /TBD/ library. -- Save extracted data in various formats, e.g. CSV which MS Excel or Google Sheets can open. +- Download web pages using the Fetch API. +- Extract data from web pages using the Cheerio library. +- Save extracted data in various formats (e.g. CSV which MS Excel or Google Sheets can open) using the json2csv library. - Follow links programmatically (crawling). - Save time and effort with frameworks, such as Crawlee, and scraping platforms, such as Apify. 
From 2ec5a8397fc9d94e5a88939be86c9e46034ac5de Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 16 May 2025 15:30:53 +0200 Subject: [PATCH 03/26] fix: update intro to be about JS --- .../academy/webscraping/scraping_basics_javascript2/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/index.md b/sources/academy/webscraping/scraping_basics_javascript2/index.md index c3d9893a1..c7dcb96b5 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/index.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/index.md @@ -28,7 +28,7 @@ In this course we'll use JavaScript to create an application for watching prices ## Who this course is for -Anyone with basic knowledge of developing programs in JavaScript who wants to start with web scraping can take this course. The course does not expect you to have any prior knowledge of web technologies or scraping. +Anyone with basic knowledge of developing programs in JavaScript who wants to start with web scraping can take this course. The course does not expect you to have any prior knowledge of other web technologies or scraping. ## Requirements From 27ba023b289e89caf436741128d6d3761963dd8a Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 16 May 2025 15:31:08 +0200 Subject: [PATCH 04/26] fix: update devtools 1 to be about JS --- .../01_devtools_inspecting.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/01_devtools_inspecting.md b/sources/academy/webscraping/scraping_basics_javascript2/01_devtools_inspecting.md index e0e699c5e..75cd2bbe7 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/01_devtools_inspecting.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/01_devtools_inspecting.md @@ -30,7 +30,7 @@ Now let's peek behind the scenes of a real-world website—say, Wikipedia. We'll ![Wikipedia with Chrome DevTools open](./images/devtools-wikipedia.png) -Websites are built with three main technologies: HTML, CSS, and JavaScript. In the **Elements** tab, DevTools shows the HTML and CSS of the current page: +Apart from JavaScript, websites are built with two main technologies: HTML and CSS. In the **Elements** tab, DevTools shows the HTML and CSS of the current page: ![Elements tab in Chrome DevTools](./images/devtools-elements-tab.png) @@ -58,9 +58,9 @@ HTML, a markup language, describes how everything on a page is organized, how el } ``` -While HTML and CSS describe what the browser should display, [JavaScript](https://developer.mozilla.org/en-US/docs/Learn/JavaScript) is a general-purpose programming language that adds interaction to the page. +While HTML and CSS describe what the browser should display, JavaScript adds interaction to the page. In DevTools, the **Console** tab allows ad-hoc experimenting with JavaScript. -In DevTools, the **Console** tab allows ad-hoc experimenting with JavaScript. If you don't see it, press **ESC** to toggle the Console. Running commands in the Console lets us manipulate the loaded page—we’ll try this shortly. +If you don't see it, press **ESC** to toggle the Console. Running commands in the Console lets us manipulate the loaded page—we’ll try this shortly. ![Console in Chrome DevTools](./images/devtools-console.png) @@ -104,13 +104,13 @@ Encyclopedia ## Interacting with an element -We won't be creating Python scrapers just yet. 
Let's first get familiar with what we can do in the JavaScript console and how we can further interact with HTML elements on the page. +We won't be creating Node.js scrapers just yet. Let's first get familiar with what we can do in the DevTools console and how we can further interact with HTML elements on the page. In the **Elements** tab, with the subtitle element highlighted, let's right-click the element to open the context menu. There, we'll choose **Store as global variable**. The **Console** should appear, with a `temp1` variable ready. ![Global variable in Chrome DevTools Console](./images/devtools-console-variable.png) -The Console allows us to run JavaScript in the context of the loaded page, similar to Python's [interactive REPL](https://realpython.com/interacting-with-python/). We can use it to play around with elements. +The Console allows us to run code in the context of the loaded page. We can use it to play around with elements. For a start, let's access some of the subtitle's properties. One such property is `textContent`, which contains the text inside the HTML element. The last line in the Console is where your cursor is. We'll type the following and hit **Enter**: From 62ace76b1b1941edac82829d07252d3f4e38c027 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 16 May 2025 15:37:23 +0200 Subject: [PATCH 05/26] fix: update devtools 2 to be about JS --- .../02_devtools_locating_elements.md | 8 +++----- .../02_devtools_locating_elements.md | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/02_devtools_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript2/02_devtools_locating_elements.md index 1b65814a3..f148552fc 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/02_devtools_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/02_devtools_locating_elements.md @@ -56,9 +56,7 @@ The `class` attribute can hold multiple values separated by whitespace. This par ## Programmatically locating a product card -Let's jump into the **Console** and write some JavaScript. Don't worry—we don't need to know the language, and yes, this is a helpful step on our journey to creating a scraper in Python. - -In browsers, JavaScript represents the current page as the [`Document`](https://developer.mozilla.org/en-US/docs/Web/API/Document) object, accessible via `document`. This object offers many useful methods, including [`querySelector()`](https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector). This method takes a CSS selector as a string and returns the first HTML element that matches. We'll try typing this into the **Console**: +Let's jump into the **Console** and write some code. In browsers, JavaScript represents the current page as the [`Document`](https://developer.mozilla.org/en-US/docs/Web/API/Document) object, accessible via `document`. This object offers many useful methods, including [`querySelector()`](https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector). This method takes a CSS selector as a string and returns the first HTML element that matches. 
We'll try typing this into the **Console**: ```js document.querySelector('.product-item'); @@ -136,14 +134,14 @@ We'll expand the result by clicking the small arrow, then hover our cursor over ![Highlighting a querySelectorAll() result](./images/devtools-hover-queryselectorall.png) -To save the subwoofer in a variable for further inspection, we can use index access with brackets, just like in Python lists (or JavaScript arrays): +To save the subwoofer in a variable for further inspection, we can use index access with brackets, just like with regular JavaScript arrays: ```js products = document.querySelectorAll('.product-item'); subwoofer = products[2]; ``` -Even though we're just playing with JavaScript in the browser's **Console**, we're inching closer to figuring out what our Python program will need to do. In the next lesson, we'll dive into accessing child elements and extracting product details. +Even though we're just playing in the browser's **Console**, we're inching closer to figuring out what our Node.js program will need to do. In the next lesson, we'll dive into accessing child elements and extracting product details. --- diff --git a/sources/academy/webscraping/scraping_basics_python/02_devtools_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/02_devtools_locating_elements.md index d9f46a0c3..3a77ec607 100644 --- a/sources/academy/webscraping/scraping_basics_python/02_devtools_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/02_devtools_locating_elements.md @@ -135,7 +135,7 @@ We'll expand the result by clicking the small arrow, then hover our cursor over ![Highlighting a querySelectorAll() result](./images/devtools-hover-queryselectorall.png) -To save the subwoofer in a variable for further inspection, we can use index access with brackets, just like in Python lists (or JavaScript arrays): +To save the subwoofer in a variable for further inspection, we can use index access with brackets, just like with Python lists (or JavaScript arrays): ```js products = document.querySelectorAll('.product-item'); From 75121bc3b9f42b1c1443ab90940f07b99ae4353d Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 16 May 2025 15:43:19 +0200 Subject: [PATCH 06/26] fix: update devtools 3 to be about JS --- .../03_devtools_extracting_data.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/03_devtools_extracting_data.md b/sources/academy/webscraping/scraping_basics_javascript2/03_devtools_extracting_data.md index 730089bb2..aeb6fc7ed 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/03_devtools_extracting_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/03_devtools_extracting_data.md @@ -41,7 +41,7 @@ We'll use the **Elements** tab of DevTools to inspect all child elements of the ![Finding child elements](./images/devtools-product-details.png) -JavaScript represents HTML elements as [Element](https://developer.mozilla.org/en-US/docs/Web/API/Element) objects. Among properties we've already played with, such as `textContent` or `outerHTML`, it also has the [`querySelector()`](https://developer.mozilla.org/en-US/docs/Web/API/Element/querySelector) method. Here the method looks for matches only within children of the element: +Browser JavaScript represents HTML elements as [Element](https://developer.mozilla.org/en-US/docs/Web/API/Element) objects. 
Among properties we've already played with, such as `textContent` or `outerHTML`, it also has the [`querySelector()`](https://developer.mozilla.org/en-US/docs/Web/API/Element/querySelector) method. Here the method looks for matches only within children of the element: ```js title = subwoofer.querySelector('.product-item__title'); @@ -69,9 +69,9 @@ It works, but the price isn't alone in the result. Before we'd use such data, we ![Extracting product price](./images/devtools-extracting-price.png) -But for now that's okay. We're just testing the waters now, so that we have an idea about what our scraper will need to do. Once we'll get to extracting prices in Python, we'll figure out how to get the values as numbers. +But for now that's okay. We're just testing the waters now, so that we have an idea about what our scraper will need to do. Once we'll get to extracting prices in Node.js, we'll figure out how to get the values as numbers. -In the next lesson, we'll start with our Python project. First we'll be figuring out how to download the Sales page without browser and make it accessible in a Python program. +In the next lesson, we'll start with our Node.js project. First we'll be figuring out how to download the Sales page without browser and make it accessible in a Node.js program. --- @@ -79,7 +79,7 @@ In the next lesson, we'll start with our Python project. First we'll be figuring ### Extract the price of IKEA's most expensive artificial plant -At IKEA's [Artificial plants & flowers listing](https://www.ikea.com/se/en/cat/artificial-plants-flowers-20492/), use CSS selectors and HTML elements manipulation in the **Console** to extract the price of the most expensive artificial plant (sold in Sweden, as you'll be browsing their Swedish offer). Before opening DevTools, use your judgment to adjust the page to make the task as straightforward as possible. Finally, use JavaScript's [`parseInt()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/parseInt) function to convert the price text into a number. +At IKEA's [Artificial plants & flowers listing](https://www.ikea.com/se/en/cat/artificial-plants-flowers-20492/), use CSS selectors and HTML elements manipulation in the **Console** to extract the price of the most expensive artificial plant (sold in Sweden, as you'll be browsing their Swedish offer). Before opening DevTools, use your judgment to adjust the page to make the task as straightforward as possible. Finally, use the [`parseInt()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/parseInt) function to convert the price text into a number.
Solution @@ -98,7 +98,7 @@ At IKEA's [Artificial plants & flowers listing](https://www.ikea.com/se/en/cat/a ### Extract the name of the top wiki on Fandom Movies -On Fandom's [Movies page](https://www.fandom.com/topics/movies), use CSS selectors and HTML element manipulation in the **Console** to extract the name of the top wiki. Use JavaScript's [`trim()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/trim) method to remove white space around the name. +On Fandom's [Movies page](https://www.fandom.com/topics/movies), use CSS selectors and HTML element manipulation in the **Console** to extract the name of the top wiki. Use the [`trim()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/trim) method to remove white space around the name. ![Fandom's Movies page](./images/devtools-exercise-fandom.png) From 8f107d64f1fcc51e1f9ef907d84f2170526a86dc Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 23 Jun 2025 11:44:27 +0200 Subject: [PATCH 07/26] fix: update downloading to be about JS --- .../04_downloading_html.md | 170 +++++++++++------- .../05_parsing_html.md | 14 ++ 2 files changed, 116 insertions(+), 68 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md index 44d582e5f..8c82662f7 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md @@ -12,61 +12,83 @@ import Exercises from './_exercises.mdx'; --- -Using browser tools for developers is crucial for understanding the structure of a particular page, but it's a manual task. Let's start building our first automation, a Python program which downloads HTML code of the product listing. +Using browser tools for developers is crucial for understanding the structure of a particular page, but it's a manual task. Let's start building our first automation, a JavaScript program which downloads HTML code of the product listing. -## Starting a Python project +## Starting a Node.js project -Before we start coding, we need to set up a Python project. Let's create new directory with a virtual environment. Inside the directory and with the environment activated, we'll install the HTTPX library: +Before we start coding, we need to set up a Node.js project. Let's create new directory and let's name it `product-scraper`. Inside the directory, we'll initialize new project: ```text -$ pip install httpx +$ npm init +This utility will walk you through creating a package.json file. ... -Successfully installed ... httpx-0.0.0 -``` - -:::tip Installing packages - -Being comfortable around Python project setup and installing packages is a prerequisite of this course, but if you wouldn't say no to a recap, we recommend the [Installing Packages](https://packaging.python.org/en/latest/tutorials/installing-packages/) tutorial from the official Python Packaging User Guide. -::: +Press ^C at any time to quit. 
+package name: (product-scraper) +version: (1.0.0) +description: Product scraper +entry point: (index.js) +test command: +git repository: +keywords: +author: +license: (ISC) +# highlight-next-line +type: (commonjs) module +About to write to /Users/.../product-scraper/package.json: + +{ + "name": "product-scraper", + "version": "1.0.0", + "description": "Product scraper", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "author": "", + "license": "ISC", +# highlight-next-line + "type": "module" +} +``` -Now let's test that all works. Inside the project directory we'll create a new file called `main.py` with the following code: +The above creates a `package.json` file with configuration of our project. While most of the values are arbitrary, it's important that the project's type is set to `module`. Now let's test that all works. Inside the project directory we'll create a new file called `index.js` with the following code: -```py -import httpx +```js +import process from 'node:process'; -print("OK") +console.log(`All is OK, ${process.argv[2]}`); ``` -Running it as a Python program will verify that our setup is okay and we've installed HTTPX: +Running it as a Node.js program will verify that our setup is okay and we've correctly set the type to `module`. The program takes a single word as an argument and will address us with it, so let's pass it "mate", for example: ```text -$ python main.py -OK +$ node index.js mate +All is OK, mate ``` :::info Troubleshooting -If you see errors or for any other reason cannot run the code above, it means that your environment isn't set up correctly. We're sorry, but figuring out the issue is out of scope of this course. +If you see `ReferenceError: require is not defined in ES module scope, you can use import instead`, double check that in your `package.json` the type property is set to `module`. + +If you see other errors or for any other reason cannot run the code above, it means that your environment isn't set up correctly. We're sorry, but figuring out the issue is out of scope of this course. ::: ## Downloading product listing -Now onto coding! Let's change our code so it downloads HTML of the product listing instead of printing `OK`. The [documentation of the HTTPX library](https://www.python-httpx.org/) provides us with examples how to use it. Inspired by those, our code will look like this: - -```py -import httpx +Now onto coding! Let's change our code so it downloads HTML of the product listing instead of printing `All is OK`. The [documentation of the Fetch API](https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API/Using_Fetch) provides us with examples how to use it. Inspired by those, our code will look like this: -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -print(response.text) +```js +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); +console.log(await response.text()); ``` If we run the program now, it should print the downloaded HTML: ```text -$ python main.py +$ node index.js @@ -80,7 +102,7 @@ $ python main.py ``` -Running `httpx.get(url)`, we made a HTTP request and received a response. It's not particularly useful yet, but it's a good start of our scraper. +Running `await fetch(url)`, we made a HTTP request and received a response. It's not particularly useful yet, but it's a good start of our scraper. 
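If you're curious what else the response object can tell us, here's a small optional sketch, just for experimenting, that peeks at the response metadata before reading the body. The exact header value shown in the comment is only an example of what the server may send:

```js
const url = "https://warehouse-theme-metal.myshopify.com/collections/sales";
const response = await fetch(url);

// 200 means the server fulfilled the request successfully
console.log(response.status);
// the Content-Type header should reveal the body is HTML, e.g. "text/html; charset=utf-8"
console.log(response.headers.get("content-type"));
```

Status codes will come in handy in a moment, when we look at requests which don't end up successfully.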
:::tip Client and server, request and response @@ -88,7 +110,7 @@ HTTP is a network protocol powering the internet. Understanding it well is an im - HTTP is an exchange between two participants. - The _client_ sends a _request_ to the _server_, which replies with a _response_. -- In our case, `main.py` is the client, and the technology running at `warehouse-theme-metal.myshopify.com` replies to our request as the server. +- In our case, `index.js` is the client, and the technology running at `warehouse-theme-metal.myshopify.com` replies to our request as the server. ::: @@ -110,28 +132,30 @@ First, let's ask for trouble. We'll change the URL in our code to a page that do https://warehouse-theme-metal.myshopify.com/does/not/exist ``` -We could check the value of `response.status_code` against a list of allowed numbers, but HTTPX already provides `response.raise_for_status()`, a method that analyzes the number and raises the `httpx.HTTPError` exception if our request wasn't successful: +We could check the value of `response.status` against a list of allowed numbers, but the Fetch API already provides `response.ok`, a property which returns `false` if our request wasn't successful: -```py -import httpx +```js +const url = "https://warehouse-theme-metal.myshopify.com/does/not/exist"; +const response = await fetch(url); -url = "https://warehouse-theme-metal.myshopify.com/does/not/exist" -response = httpx.get(url) -response.raise_for_status() -print(response.text) +if (response.ok) { + console.log(await response.text()); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` If you run the code above, the program should crash: ```text -$ python main.py -Traceback (most recent call last): - File "/Users/.../main.py", line 5, in - response.raise_for_status() - File "/Users/.../.venv/lib/python3/site-packages/httpx/_models.py", line 761, in raise_for_status - raise HTTPStatusError(message, request=request, response=self) -httpx.HTTPStatusError: Client error '404 Not Found' for url 'https://warehouse-theme-metal.myshopify.com/does/not/exist' -For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404 +$ node index.js +file:///Users/.../index.js:7 + throw new Error(`HTTP ${response.status}`); + ^ + +Error: HTTP 404 + at file:///Users/.../index.js:7:9 + at process.processTicksAndRejections (node:internal/process/task_queues:105:5) ``` Letting our program visibly crash on error is enough for our purposes. Now, let's return to our primary goal. In the next lesson, we'll be looking for a way to extract information about products from the downloaded HTML. @@ -151,13 +175,15 @@ https://www.aliexpress.com/w/wholesale-darth-vader.html
Solution - ```py - import httpx + ```js + const url = "https://www.aliexpress.com/w/wholesale-darth-vader.html"; + const response = await fetch(url); - url = "https://www.aliexpress.com/w/wholesale-darth-vader.html" - response = httpx.get(url) - response.raise_for_status() - print(response.text) + if (response.ok) { + console.log(await response.text()); + } else { + throw new Error(`HTTP ${response.status}`); + } ```
@@ -176,26 +202,30 @@ https://warehouse-theme-metal.myshopify.com/collections/sales Right in your Terminal or Command Prompt, you can create files by _redirecting output_ of command line programs: ```text - python main.py > products.html + node index.js > products.html ``` - If you want to use Python instead, it offers several ways how to create files. The solution below uses [pathlib](https://docs.python.org/3/library/pathlib.html): + If you want to use Node.js instead, it offers several ways how to create files. The solution below uses the [Promises API](https://nodejs.org/api/fs.html#promises-api): - ```py - import httpx - from pathlib import Path + ```js + import { writeFile } from 'node:fs/promises'; - url = "https://warehouse-theme-metal.myshopify.com/collections/sales" - response = httpx.get(url) - response.raise_for_status() - Path("products.html").write_text(response.text) + const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; + const response = await fetch(url); + + if (response.ok) { + const html = await response.text(); + await writeFile('products.html', html); + } else { + throw new Error(`HTTP ${response.status}`); + } ```
### Download an image as a file -Download a product image, then save it on your disk as a file. While HTML is _textual_ content, images are _binary_. You may want to scan through the [HTTPX QuickStart](https://www.python-httpx.org/quickstart/) for guidance. You can use this URL pointing to an image of a TV: +Download a product image, then save it on your disk as a file. While HTML is _textual_ content, images are _binary_. You may want to scan through the [Fetch API documentation](https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API/Using_Fetch#reading_the_response_body) for guidance. Especially check `Response.arrayBuffer()`. You can use this URL pointing to an image of a TV: ```text https://warehouse-theme-metal.myshopify.com/cdn/shop/products/sonyxbr55front_f72cc8ff-fcd6-4141-b9cc-e1320f867785.jpg @@ -204,16 +234,20 @@ https://warehouse-theme-metal.myshopify.com/cdn/shop/products/sonyxbr55front_f72
Solution - Python offers several ways how to create files. The solution below uses [pathlib](https://docs.python.org/3/library/pathlib.html): + Node.js offers several ways how to create files. The solution below uses [Promises API](https://nodejs.org/api/fs.html#promises-api): + + ```js + import { writeFile } from 'node:fs/promises'; - ```py - from pathlib import Path - import httpx + const url = "https://warehouse-theme-metal.myshopify.com/cdn/shop/products/sonyxbr55front_f72cc8ff-fcd6-4141-b9cc-e1320f867785.jpg"; + const response = await fetch(url); - url = "https://warehouse-theme-metal.myshopify.com/cdn/shop/products/sonyxbr55front_f72cc8ff-fcd6-4141-b9cc-e1320f867785.jpg" - response = httpx.get(url) - response.raise_for_status() - Path("tv.jpg").write_bytes(response.content) + if (response.ok) { + const buffer = Buffer.from(await response.arrayBuffer()); + await writeFile('tv.jpg', buffer); + } else { + throw new Error(`HTTP ${response.status}`); + } ```
diff --git a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md index 6f96ed2c7..5492055fe 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md @@ -38,6 +38,20 @@ $ pip install beautifulsoup4 Successfully installed beautifulsoup4-4.0.0 soupsieve-0.0 ``` + + Now let's use it for parsing the HTML. The `BeautifulSoup` object allows us to work with the HTML elements in a structured way. As a demonstration, we'll first get the `

` element, which represents the main heading of the page. ![Element of the main heading](./images/h1.png) From 6cbd29d0b726b1578b809c322c61632fb14b1f14 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 24 Jun 2025 10:48:45 +0200 Subject: [PATCH 08/26] fix: update parsing to be about JS --- .../04_downloading_html.md | 18 +- .../05_parsing_html.md | 182 +++++++++++------- .../scraping_basics_javascript2/index.md | 2 +- .../scraping_basics_python/05_parsing_html.md | 22 +-- 4 files changed, 138 insertions(+), 86 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md index 8c82662f7..b89c93a26 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/04_downloading_html.md @@ -69,9 +69,15 @@ All is OK, mate :::info Troubleshooting -If you see `ReferenceError: require is not defined in ES module scope, you can use import instead`, double check that in your `package.json` the type property is set to `module`. +If you see errors or for any other reason cannot run the code above, it means that your environment isn't set up correctly. We're sorry, but figuring out the issue is out of scope of this course. -If you see other errors or for any other reason cannot run the code above, it means that your environment isn't set up correctly. We're sorry, but figuring out the issue is out of scope of this course. +Double check that in your `package.json` the type property is set to `module`, otherwise you'll get the following warning: + +```text +[MODULE_TYPELESS_PACKAGE_JSON] Warning: Module type of file:///Users/.../product-scraper/index.js is not specified and it doesn't parse as CommonJS. +Reparsing as ES module because module syntax was detected. This incurs a performance overhead. +To eliminate this warning, add "type": "module" to /Users/.../product-scraper/package.json. +``` ::: @@ -85,6 +91,12 @@ const response = await fetch(url); console.log(await response.text()); ``` +:::tip Asynchronous flow + +First time you see `await`? It's a modern syntax for working with promises. See the [JavaScript Asynchronous Programming and Callbacks](https://nodejs.org/en/learn/asynchronous-work/javascript-asynchronous-programming-and-callbacks) and [Discover Promises in Node.js](https://nodejs.org/en/learn/asynchronous-work/discover-promises-in-nodejs) tutorials in the official Node.js documentation for more. + +::: + If we run the program now, it should print the downloaded HTML: ```text @@ -225,7 +237,7 @@ https://warehouse-theme-metal.myshopify.com/collections/sales ### Download an image as a file -Download a product image, then save it on your disk as a file. While HTML is _textual_ content, images are _binary_. You may want to scan through the [Fetch API documentation](https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API/Using_Fetch#reading_the_response_body) for guidance. Especially check `Response.arrayBuffer()`. You can use this URL pointing to an image of a TV: +Download a product image, then save it on your disk as a file. While HTML is _textual_ content, images are _binary_. You may want to scan through the [Fetch API documentation](https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API/Using_Fetch#reading_the_response_body) and the [Writing files with Node.js](https://nodejs.org/en/learn/manipulating-files/writing-files-with-nodejs) tutorial for guidance. 
Especially check `Response.arrayBuffer()`. You can use this URL pointing to an image of a TV: ```text https://warehouse-theme-metal.myshopify.com/cdn/shop/products/sonyxbr55front_f72cc8ff-fcd6-4141-b9cc-e1320f867785.jpg diff --git a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md index 5492055fe..3a39241cc 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md @@ -20,9 +20,9 @@ As a first step, let's try counting how many products are on the listing page. ## Processing HTML -After downloading, the entire HTML is available in our program as a string. We can print it to the screen or save it to a file, but not much more. However, since it's a string, could we use [string operations](https://docs.python.org/3/library/stdtypes.html#string-methods) or [regular expressions](https://docs.python.org/3/library/re.html) to count the products? +After downloading, the entire HTML is available in our program as a string. We can print it to the screen or save it to a file, but not much more. However, since it's a string, could we use [string operations](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#instance_methods) or [regular expressions](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions) to count the products? -While somewhat possible, such an approach is tedious, fragile, and unreliable. To work with HTML, we need a robust tool dedicated to the task: an _HTML parser_. It takes a text with HTML markup and turns it into a tree of Python objects. +While somewhat possible, such an approach is tedious, fragile, and unreliable. To work with HTML, we need a robust tool dedicated to the task: an _HTML parser_. It takes a text with HTML markup and turns it into a tree of JavaScript objects. :::info Why regex can't parse HTML @@ -30,152 +30,192 @@ While [Bobince's infamous StackOverflow answer](https://stackoverflow.com/a/1732 ::: -We'll choose [Beautiful Soup](https://beautiful-soup-4.readthedocs.io/) as our parser, as it's a popular library renowned for its ability to process even non-standard, broken markup. This is useful for scraping, because real-world websites often contain all sorts of errors and discrepancies. +We'll choose [Cheerio](https://cheerio.js.org/) as our parser, as it's a popular library which can process even non-standard, broken markup. This is useful for scraping, because real-world websites often contain all sorts of errors and discrepancies. In the project directory, we'll run the following to install the Cheerio package: ```text -$ pip install beautifulsoup4 +$ npm install cheerio + +added 23 packages, and audited 24 packages in 1s ... -Successfully installed beautifulsoup4-4.0.0 soupsieve-0.0 ``` - - -Now let's use it for parsing the HTML. The `BeautifulSoup` object allows us to work with the HTML elements in a structured way. As a demonstration, we'll first get the `

` element, which represents the main heading of the page. +Now let's import the package and use it for parsing the HTML. The `cheerio` module allows us to work with the HTML elements in a structured way. As a demonstration, we'll first get the `

` element, which represents the main heading of the page. ![Element of the main heading](./images/h1.png) We'll update our code to the following: -```py -import httpx -from bs4 import BeautifulSoup +```js +import * as cheerio from 'cheerio'; -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") -print(soup.select("h1")) +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + console.log($("h1")); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` Then let's run the program: ```text -$ python main.py -[
<h1 class="collection__title heading h1">
Sales
</h1>
] +$ node index.js +LoadedCheerio { + '0': Element { + parent: Element { ... }, + prev: Text { ... }, + next: Element { ... }, + startIndex: null, + endIndex: null, +# highlight-next-line + children: [ [Text] ], +# highlight-next-line + name: 'h1', + attribs: [Object: null prototype] { class: 'collection__title heading h1' }, + type: 'tag', + namespace: 'http://www.w3.org/1999/xhtml', + 'x-attribsNamespace': [Object: null prototype] { class: undefined }, + 'x-attribsPrefix': [Object: null prototype] { class: undefined } + }, + length: 1, + ... +} ``` -Our code lists all `h1` elements it can find on the page. It's the case that there's just one, so in the result we can see a list with a single item. What if we want to print just the text? Let's change the end of the program to the following: +Our code prints a Cheerio object. It's something like an array of all `h1` elements Cheerio can find in the HTML we gave it. It's the case that there's just one, so we can see only a single item in the container. + +The item has many properties, such as references to its parent or sibling elements, but most importantly, its name is `h1` and in the `children` property, it contains a single text element. Now let's print just the text. Let's change our program to the following: + +```js +import * as cheerio from 'cheerio'; -```py -headings = soup.select("h1") -first_heading = headings[0] -print(first_heading.text) +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); + +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + // highlight-next-line + console.log($("h1").text()); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` -If we run our scraper again, it prints the text of the first `h1` element: +Thanks to the nature of the Cheerio object we don't have to explicitly find the first element. If we call `.text()`, it automatically assumes we want to work with the first element in the collection. Thus, if we run our scraper again, it prints the text of the first `h1` element: ```text -$ python main.py +$ node index.js Sales ``` :::note Dynamic websites -The Warehouse returns full HTML in its initial response, but many other sites add content via JavaScript after the page loads or after user interaction. In such cases, what we see in DevTools may differ from `response.text` in Python. Learn how to handle these scenarios in our [API Scraping](../api_scraping/index.md) and [Puppeteer & Playwright](../puppeteer_playwright/index.md) courses. +The Warehouse returns full HTML in its initial response, but many other sites add some content after the page loads or after user interaction. In such cases, what we'd see in DevTools could differ from `await response.text()` in Node.js. Learn how to handle these scenarios in our [API Scraping](../api_scraping/index.md) and [Puppeteer & Playwright](../puppeteer_playwright/index.md) courses. ::: ## Using CSS selectors -Beautiful Soup's `.select()` method runs a _CSS selector_ against a parsed HTML document and returns all the matching elements. It's like calling `document.querySelectorAll()` in browser DevTools. +Cheerio's `$()` method runs a _CSS selector_ against a parsed HTML document and returns all the matching elements. It's like calling `document.querySelectorAll()` in browser DevTools. 
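As a quick aside (not part of the lesson's main program), a tiny made-up snippet can show what `$()` returns for different selectors. The markup below is invented purely for illustration:

```js
import * as cheerio from 'cheerio';

// Invented markup, used only to demonstrate how $() selects elements
const $ = cheerio.load(`
  <ul>
    <li class="product">Speaker</li>
    <li class="product">TV</li>
  </ul>
`);

console.log($("li.product").length); // 2
console.log($("li.product").text()); // "SpeakerTV" (texts of all matches combined)
```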
-Scanning through [usage examples](https://beautiful-soup-4.readthedocs.io/en/latest/#css-selectors) will help us to figure out code for counting the product cards: +Scanning through [usage examples](https://cheerio.js.org/docs/basics/selecting) will help us to figure out code for counting the product cards: -```py -import httpx -from bs4 import BeautifulSoup +```js +import * as cheerio from 'cheerio'; -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") -products = soup.select(".product-item") -print(len(products)) +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + // highlight-next-line + console.log($(".product-item").length); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` -In CSS, `.product-item` selects all elements whose `class` attribute contains value `product-item`. We call `soup.select()` with the selector and get back a list of matching elements. Beautiful Soup handles all the complexity of understanding the HTML markup for us. On the last line, we use `len()` to count how many items there is in the list. +In CSS, `.product-item` selects all elements whose `class` attribute contains value `product-item`. We call `$()` with the selector and get back a container of matching elements. Cheerio handles all the complexity of understanding the HTML markup for us. Then we use `.length` to count how many items there is in the container. ```text -$ python main.py +$ node index.js 24 ``` That's it! We've managed to download a product listing, parse its HTML, and count how many products it contains. In the next lesson, we'll be looking for a way to extract detailed information about individual products. +:::info Cheerio and jQuery + +The Cheerio documentation frequently mentions something called jQuery. In the medieval days of the internet, when so-called Internet Explorers roamed the untamed plains of simple websites, developers created the first JavaScript frameworks to improve their crude tools and overcome the wild inconsistencies between browsers. Imagine a time when things like `document.querySelectorAll()` didn't even exist. jQuery was the most popular of these frameworks, granting great power to those who knew how to wield it. + +Cheerio was deliberately designed to mimic jQuery's interface. At the time, nearly everyone was familiar with it, and it felt like the most natural way to walk through HTML elements. jQuery was used in the browser, Cheerio in Node.js. But as time passed, jQuery gradually faded from relevance. In a twist of history, we now learn its syntax only to use Cheerio. + +::: + --- -### Scrape F1 teams +### Scrape F1 Academy teams -Print a total count of F1 teams listed on this page: +Print a total count of F1 Academy teams listed on this page: ```text -https://www.formula1.com/en/teams +https://www.f1academy.com/Racing-Series/Teams ```
Solution - ```py - import httpx - from bs4 import BeautifulSoup + ```js + import * as cheerio from 'cheerio'; - url = "https://www.formula1.com/en/teams" - response = httpx.get(url) - response.raise_for_status() + const url = "https://www.f1academy.com/Racing-Series/Teams"; + const response = await fetch(url); - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - print(len(soup.select(".group"))) + if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + console.log($(".teams-driver-item").length); + } else { + throw new Error(`HTTP ${response.status}`); + } ```
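If the number looks suspicious, one way to sanity-check the selector (assuming the page markup hasn't changed since this was written) is to run an equivalent query in the browser's DevTools Console while on the Teams page:

```js
// Hypothetical cross-check in browser DevTools, not part of the scraper itself
document.querySelectorAll(".teams-driver-item").length;
```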
-### Scrape F1 drivers +### Scrape F1 Academy drivers -Use the same URL as in the previous exercise, but this time print a total count of F1 drivers. +Use the same URL as in the previous exercise, but this time print a total count of F1 Academy drivers.
Solution - ```py - import httpx - from bs4 import BeautifulSoup + ```js + import * as cheerio from 'cheerio'; - url = "https://www.formula1.com/en/teams" - response = httpx.get(url) - response.raise_for_status() + const url = "https://www.f1academy.com/Racing-Series/Teams"; + const response = await fetch(url); - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - print(len(soup.select(".f1-team-driver-name"))) + if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + console.log($(".driver").length); + } else { + throw new Error(`HTTP ${response.status}`); + } ```
diff --git a/sources/academy/webscraping/scraping_basics_javascript2/index.md b/sources/academy/webscraping/scraping_basics_javascript2/index.md index c7dcb96b5..3751f05ef 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/index.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/index.md @@ -33,7 +33,7 @@ Anyone with basic knowledge of developing programs in JavaScript who wants to st ## Requirements - A macOS, Linux, or Windows machine with a web browser and Node.js installed. -- Familiarity with JavaScript basics: variables, conditions, loops, functions, strings, lists, dictionaries, files, classes, and exceptions. +- Familiarity with JavaScript basics: variables, conditions, loops, functions, strings, arrays, objects, files, classes, promises, imports, and exceptions. - Comfort with building a Node.js package and installing dependencies with `npm`. - Familiarity with running commands in Terminal (macOS/Linux) or Command Prompt (Windows). diff --git a/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md index 8b90a5cf1..74c399b69 100644 --- a/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md @@ -63,7 +63,7 @@ $ python main.py [
<h1 class="collection__title heading h1">
Sales
</h1>
] ``` -Our code lists all `h1` elements it can find on the page. It's the case that there's just one, so in the result we can see a list with a single item. What if we want to print just the text? Let's change the end of the program to the following: +Our code lists all `h1` elements it can find in the HTML we gave it. It's the case that there's just one, so in the result we can see a list with a single item. What if we want to print just the text? Let's change the end of the program to the following: ```py headings = soup.select("h1") @@ -80,7 +80,7 @@ Sales :::note Dynamic websites -The Warehouse returns full HTML in its initial response, but many other sites add content via JavaScript after the page loads or after user interaction. In such cases, what we see in DevTools may differ from `response.text` in Python. Learn how to handle these scenarios in our [API Scraping](../api_scraping/index.md) and [Puppeteer & Playwright](../puppeteer_playwright/index.md) courses. +The Warehouse returns full HTML in its initial response, but many other sites add some content after the page loads or after user interaction. In such cases, what we'd see in DevTools could differ from `response.text` in Python. Learn how to handle these scenarios in our [API Scraping](../api_scraping/index.md) and [Puppeteer & Playwright](../puppeteer_playwright/index.md) courses. ::: @@ -117,12 +117,12 @@ That's it! We've managed to download a product listing, parse its HTML, and coun -### Scrape F1 teams +### Scrape F1 Academy teams -Print a total count of F1 teams listed on this page: +Print a total count of F1 Academy teams listed on this page: ```text -https://www.formula1.com/en/teams +https://www.f1academy.com/Racing-Series/Teams ```
@@ -132,20 +132,20 @@ https://www.formula1.com/en/teams import httpx from bs4 import BeautifulSoup - url = "https://www.formula1.com/en/teams" + url = "https://www.f1academy.com/Racing-Series/Teams" response = httpx.get(url) response.raise_for_status() html_code = response.text soup = BeautifulSoup(html_code, "html.parser") - print(len(soup.select(".group"))) + print(len(soup.select(".teams-driver-item"))) ```
-### Scrape F1 drivers +### Scrape F1 Academy drivers -Use the same URL as in the previous exercise, but this time print a total count of F1 drivers. +Use the same URL as in the previous exercise, but this time print a total count of F1 Academy drivers.
Solution @@ -154,13 +154,13 @@ Use the same URL as in the previous exercise, but this time print a total count import httpx from bs4 import BeautifulSoup - url = "https://www.formula1.com/en/teams" + url = "https://www.f1academy.com/Racing-Series/Teams" response = httpx.get(url) response.raise_for_status() html_code = response.text soup = BeautifulSoup(html_code, "html.parser") - print(len(soup.select(".f1-team-driver-name"))) + print(len(soup.select(".driver"))) ```
From 87f5b96183421ab543265289086b25fabbdd14af Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 24 Jun 2025 12:12:14 +0200 Subject: [PATCH 09/26] fix: update locating to be about JS --- .../05_parsing_html.md | 6 +- .../06_locating_elements.md | 326 +++++++++--------- .../06_locating_elements.md | 2 +- 3 files changed, 171 insertions(+), 163 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md index 3a39241cc..3f00410e5 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md @@ -92,7 +92,7 @@ LoadedCheerio { } ``` -Our code prints a Cheerio object. It's something like an array of all `h1` elements Cheerio can find in the HTML we gave it. It's the case that there's just one, so we can see only a single item in the container. +Our code prints a Cheerio object. It's something like an array of all `h1` elements Cheerio can find in the HTML we gave it. It's the case that there's just one, so we can see only a single item in the selection. The item has many properties, such as references to its parent or sibling elements, but most importantly, its name is `h1` and in the `children` property, it contains a single text element. Now let's print just the text. Let's change our program to the following: @@ -112,7 +112,7 @@ if (response.ok) { } ``` -Thanks to the nature of the Cheerio object we don't have to explicitly find the first element. If we call `.text()`, it automatically assumes we want to work with the first element in the collection. Thus, if we run our scraper again, it prints the text of the first `h1` element: +Thanks to the nature of the Cheerio object we don't have to explicitly find the first element. Calling `.text()` combines texts of all elements in the selection. If we run our scraper again, it prints the text of the `h1` element: ```text $ node index.js @@ -147,7 +147,7 @@ if (response.ok) { } ``` -In CSS, `.product-item` selects all elements whose `class` attribute contains value `product-item`. We call `$()` with the selector and get back a container of matching elements. Cheerio handles all the complexity of understanding the HTML markup for us. Then we use `.length` to count how many items there is in the container. +In CSS, `.product-item` selects all elements whose `class` attribute contains value `product-item`. We call `$()` with the selector and get back matching elements. Cheerio handles all the complexity of understanding the HTML markup for us. Then we use `.length` to count how many items there is in the selection. ```text $ node index.js diff --git a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md index 2aa3100e7..bbc5d0e62 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md @@ -12,46 +12,50 @@ import Exercises from './_exercises.mdx'; --- -In the previous lesson we've managed to print text of the page's main heading or count how many products are in the listing. Let's combine those two. What happens if we print `.text` for each product card? 
- -```py -import httpx -from bs4 import BeautifulSoup - -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() - -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") - -for product in soup.select(".product-item"): - print(product.text) +In the previous lesson we've managed to print text of the page's main heading or count how many products are in the listing. Let's combine those two. What happens if we print `.text()` for each product card? + +```js +import * as cheerio from 'cheerio'; + +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); + +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + // highlight-next-line + $(".product-item").each((i, element) => { + // highlight-next-line + console.log($(element).text()); + // highlight-next-line + }); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` -Well, it definitely prints _something_… +We're using [`each()`](https://cheerio.js.org/docs/api/classes/Cheerio#each) to loop over the items in the Cheerio container. It calls given function for each of the elements, with two arguments. The first is an index (0, 1, 2…), and the second is the element being processed. -```text -$ python main.py -Save $25.00 +Cheerio requires us to wrap the element with `$()` again before we can work with it further, and then we call `.text()`. If we run the code, it… well, it definitely prints _something_… +```text +$ node index.js -JBL -JBL Flip 4 Waterproof Portable Bluetooth Speaker + JBL +JBL Flip 4 Waterproof Portable Bluetooth Speaker -Black -+7 + Black -Blue + +7 -+6 + Blue -Grey + +6 ... ``` @@ -65,84 +69,48 @@ As in the browser DevTools lessons, we need to change the code so that it locate We should be looking for elements which have the `product-item__title` and `price` classes. We already know how that translates to CSS selectors: -```py -import httpx -from bs4 import BeautifulSoup +```js +import * as cheerio from 'cheerio'; -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); -for product in soup.select(".product-item"): - titles = product.select(".product-item__title") - first_title = titles[0].text + $(".product-item").each((i, element) => { + const productItem = $(element); - prices = product.select(".price") - first_price = prices[0].text + const title = productItem.find(".product-item__title"); + const titleText = title.text(); - print(first_title, first_price) + const price = productItem.find(".price"); + const priceText = price.text(); + + console.log(`${titleText} | ${priceText}`); + }); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` Let's run the program now: ```text $ python main.py -JBL Flip 4 Waterproof Portable Bluetooth Speaker -Sale price$74.95 -Sony XBR-950G BRAVIA 4K HDR Ultra HD TV -Sale priceFrom $1,398.00 +JBL Flip 4 Waterproof Portable Bluetooth Speaker | + Sale price$74.95 +Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | + Sale priceFrom $1,398.00 ... ``` There's still some room for improvement, but it's already much better! 
-## Locating a single element - -Often, we want to assume in our code that a certain element exists only once. It's a bit tedious to work with lists when you know you're looking for a single element. For this purpose, Beautiful Soup offers the `.select_one()` method. Like `document.querySelector()` in browser DevTools, it returns just one result or `None`. Let's simplify our code! - -```py -import httpx -from bs4 import BeautifulSoup - -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() - -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") - -for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text - price = product.select_one(".price").text - print(title, price) -``` - -This program does the same as the one we already had, but its code is more concise. - -:::note Fragile code - -We assume that the selectors we pass to the `select()` or `select_one()` methods return at least one element. If they don't, calling `[0]` on an empty list or `.text` on `None` would crash the program. If you perform type checking on your Python program, the code examples above will trigger warnings about this. - -Not handling these cases allows us to keep the code examples more succinct. Additionally, if we expect the selectors to return elements but they suddenly don't, it usually means the website has changed since we wrote our scraper. Letting the program crash in such cases is a valid way to notify ourselves that we need to fix it. - -::: - ## Precisely locating price -In the output we can see that the price isn't located precisely: - -```text -JBL Flip 4 Waterproof Portable Bluetooth Speaker -Sale price$74.95 -Sony XBR-950G BRAVIA 4K HDR Ultra HD TV -Sale priceFrom $1,398.00 -... -``` - -For each product, our scraper also prints the text `Sale price`. Let's look at the HTML structure again. Each bit containing the price looks like this: +In the output we can see that the price isn't located precisely. For each product, our scraper also prints the text `Sale price`. Let's look at the HTML structure again. Each bit containing the price looks like this: ```html @@ -151,58 +119,77 @@ For each product, our scraper also prints the text `Sale price`. Let's look at t ``` -When translated to a tree of Python objects, the element with class `price` will contain several _nodes_: +When translated to a tree of JavaScript objects, the element with class `price` will contain several _nodes_: - Textual node with white space, - a `span` HTML element, - a textual node representing the actual amount and possibly also white space. -We can use Beautiful Soup's `.contents` property to access individual nodes. It returns a list of nodes like this: +We can use Cheerio's [`.contents()`](https://cheerio.js.org/docs/api/classes/Cheerio#contents) method to access individual nodes. It returns a list of nodes like this: -```py -["\n", Sale price, "$74.95"] +```text +LoadedCheerio { + '0': Text { + parent: Element { ... }, + prev: null, + next: Element { ... }, + data: '\n ', + type: 'text' + }, + '1': Element { + parent: Element { ... }, + prev: Text { ... }, + next: Text { ... }, + children: [ [Text] ], + name: 'span', + type: 'tag', + ... + }, + '2': Text { + parent: Element { ... }, + prev: Element { ... }, + next: null, + data: '$74.95', + type: 'text' + }, + length: 3, + ... +} ``` -It seems like we can read the last element to get the actual amount from a list like the above. 
Let's fix our program: - -```py -import httpx -from bs4 import BeautifulSoup +It seems like we can read the last element to get the actual amount. Let's fix our program: -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() +```js +import * as cheerio from 'cheerio'; -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") - -for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text - price = product.select_one(".price").contents[-1] - print(title, price) -``` +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); -If we run the scraper now, it should print prices as only amounts: +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); -```text -$ python main.py -JBL Flip 4 Waterproof Portable Bluetooth Speaker $74.95 -Sony XBR-950G BRAVIA 4K HDR Ultra HD TV From $1,398.00 -... -``` + $(".product-item").each((i, element) => { + const productItem = $(element); -## Formatting output + const title = productItem.find(".product-item__title"); + const titleText = title.text(); -The results seem to be correct, but they're hard to verify because the prices visually blend with the titles. Let's set a different separator for the `print()` function: + // highlight-next-line + const price = productItem.find(".price").contents().last(); + const priceText = price.text(); -```py -print(title, price, sep=" | ") + console.log(`${titleText} | ${priceText}`); + }); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` -The output is much nicer this way: +We're enjoying the fact that Cheerio selections provide utility methods for accessing items, such as [`.first()`](https://cheerio.js.org/docs/api/classes/Cheerio#first) or [`.last()`](https://cheerio.js.org/docs/api/classes/Cheerio#last). If we run the scraper now, it should print prices as only amounts: ```text -$ python main.py +$ node index.js JBL Flip 4 Waterproof Portable Bluetooth Speaker | $74.95 Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | From $1,398.00 ... @@ -216,7 +203,7 @@ Great! We have managed to use CSS selectors and walk the HTML tree to get a list ### Scrape Wikipedia -Download Wikipedia's page with the list of African countries, use Beautiful Soup to parse it, and print short English names of all the states and territories mentioned in all tables. This is the URL: +Download Wikipedia's page with the list of African countries, use Cheerio to parse it, and print short English names of all the states and territories mentioned in all tables. This is the URL: ```text https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa @@ -235,24 +222,36 @@ Botswana
Solution - ```py - import httpx - from bs4 import BeautifulSoup + ```js + import * as cheerio from 'cheerio'; + + const url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"; + const response = await fetch(url); - url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" - response = httpx.get(url) - response.raise_for_status() + if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") + $(".wikitable").each((i, tableElement) => { + const table = $(tableElement); + const rows = table.find("tr"); + + rows.each((j, rowElement) => { + const row = $(rowElement); + const cells = row.find("td"); + + if (cells.length > 0) { + const thirdColumn = $(cells[2]); + const link = thirdColumn.find("a").first(); + const linkText = link.text(); + console.log(linkText); + } + }); + }); + } else { + throw new Error(`HTTP ${response.status}`); + } - for table in soup.select(".wikitable"): - for row in table.select("tr"): - cells = row.select("td") - if cells: - third_column = cells[2] - title_link = third_column.select_one("a") - print(title_link.text) ``` Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells. @@ -269,26 +268,32 @@ Simplify the code from previous exercise. Use a single for loop and a single CSS
Solution - ```py - import httpx - from bs4 import BeautifulSoup - - url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for name_cell in soup.select(".wikitable tr td:nth-child(3)"): - print(name_cell.select_one("a").text) + ```js + import * as cheerio from 'cheerio'; + + const url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"; + const response = await fetch(url); + + if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + + $(".wikitable tr td:nth-child(3)").each((i, element) => { + const nameCell = $(element); + const link = nameCell.find("a").first(); + const linkText = link.text(); + console.log(linkText); + }); + } else { + throw new Error(`HTTP ${response.status}`); + } ```
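The work here is done by `td:nth-child(3)`, which matches a `td` that is the third child of its row. As a minimal sketch with invented markup (not the real Wikipedia HTML), the idea looks like this:

```js
import * as cheerio from 'cheerio';

// Invented table, used only to illustrate the :nth-child() selector
const $ = cheerio.load(`
  <table class="wikitable">
    <tr><td>1</td><td>DZ</td><td><a>Algeria</a></td></tr>
    <tr><td>2</td><td>AO</td><td><a>Angola</a></td></tr>
  </table>
`);

$(".wikitable tr td:nth-child(3)").each((i, element) => {
  console.log($(element).find("a").first().text());
});
```

This prints `Algeria` and `Angola`, one per line.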
### Scrape F1 news -Download Guardian's page with the latest F1 news, use Beautiful Soup to parse it, and print titles of all the listed articles. This is the URL: +Download Guardian's page with the latest F1 news, use Cheerio to parse it, and print titles of all the listed articles. This is the URL: ```text https://www.theguardian.com/sport/formulaone @@ -306,19 +311,22 @@ Max Verstappen wins Canadian Grand Prix: F1 – as it happened
Solution - ```py - import httpx - from bs4 import BeautifulSoup + ```js + import * as cheerio from 'cheerio'; - url = "https://www.theguardian.com/sport/formulaone" - response = httpx.get(url) - response.raise_for_status() + const url = "https://www.theguardian.com/sport/formulaone"; + const response = await fetch(url); - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") + if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); - for title in soup.select("#maincontent ul li h3"): - print(title.text) + $("#maincontent ul li h3").each((i, element) => { + console.log($(element).text()); + }); + } else { + throw new Error(`HTTP ${response.status}`); + } ```
diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md index 974f41504..e0730c9fd 100644 --- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md @@ -162,7 +162,7 @@ We can use Beautiful Soup's `.contents` property to access individual nodes. It ["\n", Sale price, "$74.95"] ``` -It seems like we can read the last element to get the actual amount from a list like the above. Let's fix our program: +It seems like we can read the last element to get the actual amount. Let's fix our program: ```py import httpx From e53cc09ba28ec0fffee4a3f06bf7432679bce817 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 24 Jun 2025 12:15:12 +0200 Subject: [PATCH 10/26] feat: make the example longer, because Congo can uncover some mistakes in the solution --- .../06_locating_elements.md | 10 ++++++++++ .../scraping_basics_python/06_locating_elements.md | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md index bbc5d0e62..21342a6eb 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md @@ -216,6 +216,16 @@ Algeria Angola Benin Botswana +Burkina Faso +Burundi +Cameroon +Cape Verde +Central African Republic +Chad +Comoros +Democratic Republic of the Congo +Republic of the Congo +Djibouti ... ``` diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md index e0730c9fd..4193c0b13 100644 --- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md @@ -228,6 +228,16 @@ Algeria Angola Benin Botswana +Burkina Faso +Burundi +Cameroon +Cape Verde +Central African Republic +Chad +Comoros +Democratic Republic of the Congo +Republic of the Congo +Djibouti ... ``` From 8b7117a6b50384c7a8c5b48d339402669f946c3e Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 24 Jun 2025 15:53:54 +0200 Subject: [PATCH 11/26] fix: update extracting to be about JS --- .../07_extracting_data.md | 434 ++++++++++-------- .../07_extracting_data.md | 36 +- 2 files changed, 255 insertions(+), 215 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md b/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md index 5a156a682..61239c7c0 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md @@ -15,7 +15,7 @@ import Exercises from './_exercises.mdx'; Locating the right HTML elements is the first step of a successful data extraction, so it's no surprise that we're already close to having the data in the correct form. The last bit that still requires our attention is the price: ```text -$ python main.py +$ node index.js JBL Flip 4 Waterproof Portable Bluetooth Speaker | $74.95 Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | From $1,398.00 ... @@ -35,172 +35,183 @@ It's because some products have variants with different prices. 
Later in the cou Ideally we'd go and discuss the problem with those who are about to use the resulting data. For their purposes, is the fact that some prices are just minimum prices important? What would be the most useful representation of the range for them? Maybe they'd tell us that it's okay if we just remove the `From` prefix? -```py -price_text = product.select_one(".price").contents[-1] -price = price_text.removeprefix("From ") +```js +const priceText = price.text().replace("From ", ""); ``` In other cases, they'd tell us the data must include the range. And in cases when we just don't know, the safest option is to include all the information we have and leave the decision on what's important to later stages. One approach could be having the exact and minimum prices as separate values. If we don't know the exact price, we leave it empty: -```py -price_text = product.select_one(".price").contents[-1] -if price_text.startswith("From "): - min_price = price_text.removeprefix("From ") - price = None -else: - min_price = price_text - price = min_price +```js +const priceRange = { minPrice: null, price: null }; +const priceText = price.text() +if (priceText.startsWith("From ")) { + priceRange.minPrice = priceText.replace("From ", ""); +} else { + priceRange.minPrice = priceText; + priceRange.price = priceRange.minPrice; +} ``` :::tip Built-in string methods -If you're not proficient in Python's string methods, [.startswith()](https://docs.python.org/3/library/stdtypes.html#str.startswith) checks the beginning of a given string, and [.removeprefix()](https://docs.python.org/3/library/stdtypes.html#str.removeprefix) removes something from the beginning of a given string. +If you're not proficient in JavaScript's string methods, [.startsWith()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/startsWith) checks the beginning of a given string, and [.replace()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/replace) changes part of a given string. 
::: The whole program would look like this: -```py -import httpx -from bs4 import BeautifulSoup - -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() - -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") - -for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text - - price_text = product.select_one(".price").contents[-1] - if price_text.startswith("From "): - min_price = price_text.removeprefix("From ") - price = None - else: - min_price = price_text - price = min_price - - print(title, min_price, price, sep=" | ") +```js +import * as cheerio from 'cheerio'; + +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); + +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + + $(".product-item").each((i, element) => { + const productItem = $(element); + + const title = productItem.find(".product-item__title"); + const titleText = title.text(); + + const price = productItem.find(".price").contents().last(); + const priceRange = { minPrice: null, price: null }; + const priceText = price.text(); + if (priceText.startsWith("From ")) { + priceRange.minPrice = priceText.replace("From ", ""); + } else { + priceRange.minPrice = priceText; + priceRange.price = priceRange.minPrice; + } + + console.log(`${titleText} | ${priceRange.minPrice} | ${priceRange.price}`); + }); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` ## Removing white space Often, the strings we extract from a web page start or end with some amount of whitespace, typically space characters or newline characters, which come from the [indentation](https://en.wikipedia.org/wiki/Indentation_(typesetting)#Indentation_in_programming) of the HTML tags. -We call the operation of removing whitespace _stripping_ or _trimming_, and it's so useful in many applications that programming languages and libraries include ready-made tools for it. Let's add Python's built-in [.strip()](https://docs.python.org/3/library/stdtypes.html#str.strip): +We call the operation of removing whitespace _trimming_ or _stripping_, and it's so useful in many applications that programming languages and libraries include ready-made tools for it. Let's add JavaScript's built-in [.trim()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/trim): -```py -title = product.select_one(".product-item__title").text.strip() +```js +const titleText = title.text().trim(); -price_text = product.select_one(".price").contents[-1].strip() +const priceText = price.text().trim(); ``` -:::info Handling strings in Beautiful Soup - -Beautiful Soup offers several attributes when it comes to working with strings: - -- `.string`, which often is like `.text`, -- `.strings`, which [returns a list of all nested textual nodes](https://beautiful-soup-4.readthedocs.io/en/latest/#strings-and-stripped-strings), -- `.stripped_strings`, which does the same but with whitespace removed. - -These might be useful in some complex scenarios, but in our case, they won't make scraping the title or price any shorter or more elegant. 
- -::: - ## Removing dollar sign and commas -We got rid of the `From` and possible whitespace, but we still can't save the price as a number in our Python program: +We got rid of the `From` and possible whitespace, but we still can't save the price as a number in our JavaScript program: -```py ->>> price = "$1,998.00" ->>> float(price) -Traceback (most recent call last): - File "", line 1, in -ValueError: could not convert string to float: '$1,998.00' +```js +> const priceText = "$1,998.00" +> parseFloat(priceText) +NaN ``` -:::tip Interactive Python +:::tip Interactive JavaScript -The demonstration above is inside the Python's [interactive REPL](https://realpython.com/interacting-with-python/). It's a useful playground where you can try how code behaves before you use it in your program. +The demonstration above is inside the Node.js' [interactive REPL](https://nodejs.org/en/learn/command-line/how-to-use-the-nodejs-repl). It's similar to running arbitrary code in your browser's DevTools Console, and it's a useful playground where you can try how code behaves before you use it in your program. ::: -We need to remove the dollar sign and the decimal commas. For this type of cleaning, [regular expressions](https://docs.python.org/3/library/re.html) are often the best tool for the job, but in this case [`.replace()`](https://docs.python.org/3/library/stdtypes.html#str.replace) is also sufficient: +We need to remove the dollar sign and the decimal commas. For this type of cleaning, [regular expressions](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions) are often the best tool for the job, but in this case [`.replace()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/replace) is also sufficient: -```py -price_text = ( - product - .select_one(".price") - .contents[-1] - .strip() - .replace("$", "") - .replace(",", "") -) +```js +const priceText = price + .text() + .trim() + .replace("$", "") + .replace(",", ""); ``` ## Representing money in programs -Now we should be able to add `float()`, so that we have the prices not as a text, but as numbers: - -```py -if price_text.startswith("From "): - min_price = float(price_text.removeprefix("From ")) - price = None -else: - min_price = float(price_text) - price = min_price +Now we should be able to add `parseFloat()`, so that we have the prices not as a text, but as numbers: + +```js +const priceRange = { minPrice: null, price: null }; +const priceText = price.text() +if (priceText.startsWith("From ")) { + priceRange.minPrice = parseFloat(priceText.replace("From ", "")); +} else { + priceRange.minPrice = parseFloat(priceText); + priceRange.price = priceRange.minPrice; +} ``` -Great! Only if we didn't overlook an important pitfall called [floating-point error](https://en.wikipedia.org/wiki/Floating-point_error_mitigation). In short, computers save `float()` numbers in a way which isn't always reliable: +Great! Only if we didn't overlook an important pitfall called [floating-point error](https://en.wikipedia.org/wiki/Floating-point_error_mitigation). In short, computers save floating point numbers in a way which isn't always reliable: ```py ->>> 0.1 + 0.2 +> 0.1 + 0.2 0.30000000000000004 ``` -These errors are small and usually don't matter, but sometimes they can add up and cause unpleasant discrepancies. That's why it's typically best to avoid `float()` when working with money. 
Let's instead use Python's built-in [`Decimal()`](https://docs.python.org/3/library/decimal.html) type: +These errors are small and usually don't matter, but sometimes they can add up and cause unpleasant discrepancies. That's why it's typically best to avoid floating point numbers when working with money. We won't store dollars, but cents: -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal - -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() - -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") - -for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text.strip() - - price_text = ( - product - .select_one(".price") - .contents[-1] - .strip() - .replace("$", "") - .replace(",", "") - ) - if price_text.startswith("From "): - min_price = Decimal(price_text.removeprefix("From ")) - price = None - else: - min_price = Decimal(price_text) - price = min_price - - print(title, min_price, price, sep=" | ") +```js +const priceText = price + .text() + .trim() + .replace("$", "") +// highlight-next-line + .replace(".", "") + .replace(",", ""); +``` + +In this case, removing the dot from the price text is the same as if we multiplied all the numbers with 100, effectively converting dollars to cents. This is how the whole program looks like now: + +```js +import * as cheerio from 'cheerio'; + +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); + +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + + $(".product-item").each((i, element) => { + const productItem = $(element); + + const title = productItem.find(".product-item__title"); + const titleText = title.text().trim(); + + const price = productItem.find(".price").contents().last(); + const priceRange = { minPrice: null, price: null }; + const priceText = price + .text() + .trim() + .replace("$", "") + .replace(".", "") + .replace(",", ""); + + if (priceText.startsWith("From ")) { + priceRange.minPrice = parseInt(priceText.replace("From ", "")); + } else { + priceRange.minPrice = parseInt(priceText); + priceRange.price = priceRange.minPrice; + } + + console.log(`${titleText} | ${priceRange.minPrice} | ${priceRange.price}`); + }); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` If we run the code above, we have nice, clean data about all the products! ```text -$ python main.py -JBL Flip 4 Waterproof Portable Bluetooth Speaker | 74.95 | 74.95 -Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 1398.00 | None +$ node index.js +JBL Flip 4 Waterproof Portable Bluetooth Speaker | 7495 | 7495 +Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 139800 | null ... ``` @@ -215,83 +226,103 @@ Well, not to spoil the excitement, but in its current form, the data isn't very Change our scraper so that it extracts how many units of each product are on stock. Your program should print the following. 
Note the unit amounts at the end of each line: ```text -JBL Flip 4 Waterproof Portable Bluetooth Speaker 672 -Sony XBR-950G BRAVIA 4K HDR Ultra HD TV 77 -Sony SACS9 10" Active Subwoofer 7 -Sony PS-HX500 Hi-Res USB Turntable 15 -Klipsch R-120SW Powerful Detailed Home Speaker - Unit 0 -Denon AH-C720 In-Ear Headphones 236 +JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672 +Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77 +Sony SACS9 10" Active Subwoofer | 7 +Sony PS-HX500 Hi-Res USB Turntable | 15 +Klipsch R-120SW Powerful Detailed Home Speaker - Unit | 0 +Denon AH-C720 In-Ear Headphones | 236 ... ```
Solution - ```py - import httpx - from bs4 import BeautifulSoup - - url = "https://warehouse-theme-metal.myshopify.com/collections/sales" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text.strip() - - units_text = ( - product - .select_one(".product-item__inventory") - .text - .removeprefix("In stock,") - .removeprefix("Only") - .removesuffix(" left") - .removesuffix("units") - .strip() - ) - if "Sold out" in units_text: - units = 0 - else: - units = int(units_text) - - print(title, units) + ```js + import * as cheerio from 'cheerio'; + + const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; + const response = await fetch(url); + + if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + + $(".product-item").each((i, element) => { + const productItem = $(element); + + const title = productItem.find(".product-item__title"); + const titleText = title.text().trim(); + + const unitsText = productItem + .find(".product-item__inventory") + .text() + .replace("In stock,", "") + .replace("Only", "") + .replace(" left", "") + .replace("units", "") + .trim(); + const unitsCount = unitsText === "Sold out" ? 0 + : parseInt(unitsText); + + console.log(`${titleText} | ${unitsCount}`); + }); + } else { + throw new Error(`HTTP ${response.status}`); + } ``` + :::tip Conditional (ternary) operator + + For brevity, the solution uses the [conditional (ternary) operator](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Conditional_operator). You can achieve the same with a plain `if` and `else` block. + + ::: +
### Use regular expressions -Simplify the code from previous exercise. Use [regular expressions](https://docs.python.org/3/library/re.html) to parse the number of units. You can match digits using a range like `[0-9]` or by a special sequence `\d`. To match more characters of the same type you can use `+`. +Simplify the code from previous exercise. Use [regular expressions](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions) to parse the number of units. You can match digits using a range like `[0-9]` or by a special sequence `\d`. To match more characters of the same type you can use `+`.
Solution - ```py - import re - import httpx - from bs4 import BeautifulSoup + ```js + import * as cheerio from 'cheerio'; + + const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; + const response = await fetch(url); - url = "https://warehouse-theme-metal.myshopify.com/collections/sales" - response = httpx.get(url) - response.raise_for_status() + if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") + $(".product-item").each((i, element) => { + const productItem = $(element); - for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text.strip() + const title = productItem.find(".product-item__title"); + const titleText = title.text().trim(); - units_text = product.select_one(".product-item__inventory").text - if re_match := re.search(r"\d+", units_text): - units = int(re_match.group()) - else: - units = 0 + const unitsText = productItem + .find(".product-item__inventory") + .text() + .trim(); + const unitsCount = unitsText === "Sold out" ? 0 + : parseInt(unitsText.match(/\d+/)); - print(title, units) + console.log(`${titleText} | ${unitsCount}`); + }); + } else { + throw new Error(`HTTP ${response.status}`); + } ``` + :::tip Conditional (ternary) operator + + For brevity, the solution uses the [conditional (ternary) operator](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Conditional_operator). You can achieve the same with a plain `if` and `else` block. + + ::: +
### Scrape publish dates of F1 news @@ -305,42 +336,51 @@ https://www.theguardian.com/sport/formulaone Your program should print something like the following. Note the dates at the end of each line: ```text -Wolff confident Mercedes are heading to front of grid after Canada improvement 2024-06-10 -Frustrated Lando Norris blames McLaren team for missed chance 2024-06-09 -Max Verstappen wins Canadian Grand Prix: F1 – as it happened 2024-06-09 +Brad Pitt in the paddock: how F1 the Movie went deep to keep fans coming | Fri Jun 20 2025 +Wolff hits out at Red Bull protest after Russell’s Canadian GP win | Tue Jun 17 2025 +F1 the Movie review – spectacular macho melodrama handles Brad Pitt with panache | Tue Jun 17 2025 +Hamilton reveals distress over ‘devastating’ groundhog accident at Canadian F1 GP | Mon Jun 16 2025 ... ``` Hints: - HTML's `time` element can have an attribute `datetime`, which [contains data in a machine-readable format](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/time), such as the ISO 8601. -- Beautiful Soup gives you [access to attributes as if they were dictionary keys](https://beautiful-soup-4.readthedocs.io/en/latest/#attributes). -- In Python you can create `datetime` objects using `datetime.fromisoformat()`, a [built-in method for parsing ISO 8601 strings](https://docs.python.org/3/library/datetime.html#datetime.datetime.fromisoformat). -- To get just the date part, you can call `.date()` on any `datetime` object. +- Cheerio gives you [.attr()](https://cheerio.js.org/docs/api/classes/Cheerio#attr) to access attributes. +- In JavaScript you can use an ISO 8601 string to create a [`Date`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Date) object. +- To get the date, you can call `.toDateString()` on `Date` objects.
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from datetime import datetime - - url = "https://www.theguardian.com/sport/formulaone" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for article in soup.select("#maincontent ul li"): - title = article.select_one("h3").text.strip() - - time_iso = article.select_one("time")["datetime"].strip() - published_at = datetime.fromisoformat(time_iso) - published_on = published_at.date() - - print(title, published_on) + ```js + import * as cheerio from 'cheerio'; + + const url = "https://www.theguardian.com/sport/formulaone"; + const response = await fetch(url); + + if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + + $("#maincontent ul li").each((i, element) => { + const article = $(element); + + const titleText = article + .find("h3") + .text() + .trim(); + const dateText = article + .find("time") + .attr("datetime") + .trim(); + const date = new Date(dateText); + + console.log(`${titleText} | ${date.toDateString()}`); + }); + } else { + throw new Error(`HTTP ${response.status}`); + } ```
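For reference, the conversion relies only on built-in `Date` behavior, roughly like this. The ISO timestamp below is a made-up example value, not something taken from the page:

```js
// Made-up example value, just to show what toDateString() produces
const date = new Date("2025-06-17T10:30:00.000Z");
console.log(date.toDateString()); // e.g. "Tue Jun 17 2025" (exact day depends on the local timezone)
```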
diff --git a/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md b/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md index 7250035a5..d9ad7bc09 100644 --- a/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md +++ b/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md @@ -152,14 +152,14 @@ else: price = min_price ``` -Great! Only if we didn't overlook an important pitfall called [floating-point error](https://en.wikipedia.org/wiki/Floating-point_error_mitigation). In short, computers save `float()` numbers in a way which isn't always reliable: +Great! Only if we didn't overlook an important pitfall called [floating-point error](https://en.wikipedia.org/wiki/Floating-point_error_mitigation). In short, computers save floating point numbers in a way which isn't always reliable: ```py >>> 0.1 + 0.2 0.30000000000000004 ``` -These errors are small and usually don't matter, but sometimes they can add up and cause unpleasant discrepancies. That's why it's typically best to avoid `float()` when working with money. Let's instead use Python's built-in [`Decimal()`](https://docs.python.org/3/library/decimal.html) type: +These errors are small and usually don't matter, but sometimes they can add up and cause unpleasant discrepancies. That's why it's typically best to avoid floating point numbers when working with money. Let's instead use Python's built-in [`Decimal()`](https://docs.python.org/3/library/decimal.html) type: ```py import httpx @@ -214,12 +214,12 @@ Well, not to spoil the excitement, but in its current form, the data isn't very Change our scraper so that it extracts how many units of each product are on stock. Your program should print the following. Note the unit amounts at the end of each line: ```text -JBL Flip 4 Waterproof Portable Bluetooth Speaker 672 -Sony XBR-950G BRAVIA 4K HDR Ultra HD TV 77 -Sony SACS9 10" Active Subwoofer 7 -Sony PS-HX500 Hi-Res USB Turntable 15 -Klipsch R-120SW Powerful Detailed Home Speaker - Unit 0 -Denon AH-C720 In-Ear Headphones 236 +JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672 +Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77 +Sony SACS9 10" Active Subwoofer | 7 +Sony PS-HX500 Hi-Res USB Turntable | 15 +Klipsch R-120SW Powerful Detailed Home Speaker - Unit | 0 +Denon AH-C720 In-Ear Headphones | 236 ... ``` @@ -255,7 +255,7 @@ Denon AH-C720 In-Ear Headphones 236 else: units = int(units_text) - print(title, units) + print(title, units, sep="|") ```
@@ -288,7 +288,7 @@ Simplify the code from previous exercise. Use [regular expressions](https://docs else: units = 0 - print(title, units) + print(title, units, sep="|") ``` @@ -304,9 +304,10 @@ https://www.theguardian.com/sport/formulaone Your program should print something like the following. Note the dates at the end of each line: ```text -Wolff confident Mercedes are heading to front of grid after Canada improvement 2024-06-10 -Frustrated Lando Norris blames McLaren team for missed chance 2024-06-09 -Max Verstappen wins Canadian Grand Prix: F1 – as it happened 2024-06-09 +Brad Pitt in the paddock: how F1 the Movie went deep to keep fans coming | Fri Jun 20 2025 +Wolff hits out at Red Bull protest after Russell’s Canadian GP win | Tue Jun 17 2025 +F1 the Movie review – spectacular macho melodrama handles Brad Pitt with panache | Tue Jun 17 2025 +Hamilton reveals distress over ‘devastating’ groundhog accident at Canadian F1 GP | Mon Jun 16 2025 ... ``` @@ -315,7 +316,7 @@ Hints: - HTML's `time` element can have an attribute `datetime`, which [contains data in a machine-readable format](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/time), such as the ISO 8601. - Beautiful Soup gives you [access to attributes as if they were dictionary keys](https://beautiful-soup-4.readthedocs.io/en/latest/#attributes). - In Python you can create `datetime` objects using `datetime.fromisoformat()`, a [built-in method for parsing ISO 8601 strings](https://docs.python.org/3/library/datetime.html#datetime.datetime.fromisoformat). -- To get just the date part, you can call `.date()` on any `datetime` object. +- To get the date, you can call `.strftime('%a %b %d %Y')` on `datetime` objects.
Solution @@ -335,11 +336,10 @@ Hints: for article in soup.select("#maincontent ul li"): title = article.select_one("h3").text.strip() - time_iso = article.select_one("time")["datetime"].strip() - published_at = datetime.fromisoformat(time_iso) - published_on = published_at.date() + date_iso = article.select_one("time")["datetime"].strip() + date = datetime.fromisoformat(date_iso) - print(title, published_on) + print(title, date.strftime('%a %b %d %Y'), sep="|") ```
From 7f46f01553fb5a896ac41253888999a8d9098de9 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Jun 2025 14:21:14 +0200 Subject: [PATCH 12/26] style: change order, first json, then csv Making this change because in Python it doesn't matter and in JavaScript it's easier to start with JSON, which is built-in, and only then move to CSV, which requires an additional library. --- .../webscraping/scraping_basics_python/08_saving_data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md index 6567e24ef..27656f7f0 100644 --- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md @@ -168,7 +168,7 @@ Bob,42,"reading, TypeScript" In the CSV format, if a value contains commas, we should enclose it in quotes. When we open the file in a text editor of our choice, we can see that the writer automatically handled this. -When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it. If you're using a different operating system, try opening the file with any spreadsheet program you have. +When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it as well. If you're using a different operating system, try opening the file with any spreadsheet program you have. ![CSV example preview](images/csv-example.png) From 4cdf396eef38663c5fd38c4952c1b7af15619015 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Jun 2025 14:46:41 +0200 Subject: [PATCH 13/26] fix: use --save with npm install in the parsing lesson --- .../webscraping/scraping_basics_javascript2/05_parsing_html.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md index 3f00410e5..f2c22173d 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/05_parsing_html.md @@ -33,7 +33,7 @@ While [Bobince's infamous StackOverflow answer](https://stackoverflow.com/a/1732 We'll choose [Cheerio](https://cheerio.js.org/) as our parser, as it's a popular library which can process even non-standard, broken markup. This is useful for scraping, because real-world websites often contain all sorts of errors and discrepancies. In the project directory, we'll run the following to install the Cheerio package: ```text -$ npm install cheerio +$ npm install cheerio --save added 23 packages, and audited 24 packages in 1s ... 
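In recent npm versions, `--save` is the default behavior anyway, so the flag mainly makes the intent explicit: the installed package gets recorded under `dependencies` in `package.json`, roughly like this (the version number below is only illustrative):

```json
{
  "dependencies": {
    "cheerio": "^1.0.0"
  }
}
```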
From 15e8c3f9d591c0d538aea604c4103d3d750ee2f3 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Jun 2025 15:41:23 +0200 Subject: [PATCH 14/26] fix: various improvements to the Python lesson about saving data --- .../webscraping/scraping_basics_python/08_saving_data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md index 27656f7f0..6567e24ef 100644 --- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md @@ -168,7 +168,7 @@ Bob,42,"reading, TypeScript" In the CSV format, if a value contains commas, we should enclose it in quotes. When we open the file in a text editor of our choice, we can see that the writer automatically handled this. -When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it as well. If you're using a different operating system, try opening the file with any spreadsheet program you have. +When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it. If you're using a different operating system, try opening the file with any spreadsheet program you have. ![CSV example preview](images/csv-example.png) From d8973ff1433be6984d6f1049441600e93353108d Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Jun 2025 15:41:44 +0200 Subject: [PATCH 15/26] fix: update saving to be about JS --- .../08_saving_data.md | 292 +++++++++--------- 1 file changed, 140 insertions(+), 152 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md index e1ad7365a..72a6e140a 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md @@ -13,9 +13,9 @@ unlisted: true We managed to scrape data about products and print it, with each product separated by a new line and each field separated by the `|` character. This already produces structured text that can be parsed, i.e., read programmatically. ```text -$ python main.py -JBL Flip 4 Waterproof Portable Bluetooth Speaker | 74.95 | 74.95 -Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 1398.00 | None +$ node index.js +JBL Flip 4 Waterproof Portable Bluetooth Speaker | 7495 | 7495 +Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 139800 | null ... ``` @@ -27,220 +27,208 @@ We should use widely popular formats that have well-defined solutions for all th Producing results line by line is an efficient approach to handling large datasets, but to simplify this lesson, we'll store all our data in one variable. 
This'll take three changes to our program: -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal - -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() - -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") - -# highlight-next-line -data = [] -for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text.strip() - - price_text = ( - product - .select_one(".price") - .contents[-1] - .strip() - .replace("$", "") - .replace(",", "") - ) - if price_text.startswith("From "): - min_price = Decimal(price_text.removeprefix("From ")) - price = None - else: - min_price = Decimal(price_text) - price = min_price - - # highlight-next-line - data.append({"title": title, "min_price": min_price, "price": price}) - -# highlight-next-line -print(data) +```js +import * as cheerio from 'cheerio'; + +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); + +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + + // highlight-next-line + const data = []; + $(".product-item").each((i, element) => { + const productItem = $(element); + + const title = productItem.find(".product-item__title"); + const titleText = title.text().trim(); + + const price = productItem.find(".price").contents().last(); + const priceRange = { minPrice: null, price: null }; + const priceText = price + .text() + .trim() + .replace("$", "") + .replace(".", "") + .replace(",", ""); + + if (priceText.startsWith("From ")) { + priceRange.minPrice = parseInt(priceText.replace("From ", "")); + } else { + priceRange.minPrice = parseInt(priceText); + priceRange.price = priceRange.minPrice; + } + + // highlight-next-line + data.push({ title: titleText, ...priceRange }) + }); + + // highlight-next-line + console.log(data); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` -Before looping over the products, we prepare an empty list. Then, instead of printing each line, we append the data of each product to the list in the form of a Python dictionary. At the end of the program, we print the entire list at once. +Before looping over the products, we prepare an empty array. Then, instead of printing each line, we append the data of each product to the array in the form of a JavaScript object. At the end of the program, we print the entire array at once. ```text -$ python main.py -[{'title': 'JBL Flip 4 Waterproof Portable Bluetooth Speaker', 'min_price': Decimal('74.95'), 'price': Decimal('74.95')}, {'title': 'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV', 'min_price': Decimal('1398.00'), 'price': None}, ...] +$ node index.js +[ + { + title: 'JBL Flip 4 Waterproof Portable Bluetooth Speaker', + minPrice: 7495, + price: 7495 + }, + { + title: 'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV', + minPrice: 139800, + price: null + }, + ... +] ``` -:::tip Pretty print +:::tip Spread syntax + +The three dots in `{ title: titleText, ...priceRange }` are called [spread syntax](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Spread_syntax). It's the same as if we wrote the following: -If you find the complex data structures printed by `print()` difficult to read, try using [`pp()`](https://docs.python.org/3/library/pprint.html#pprint.pp) from the `pprint` module instead. 
+```js +{ + title: titleText, + minPrice: priceRange.minPrice, + price: priceRange.price, +} +``` ::: -## Saving data as CSV +## Saving data as JSON -The CSV format is popular among data analysts because a wide range of tools can import it, including spreadsheets apps like LibreOffice Calc, Microsoft Excel, Apple Numbers, and Google Sheets. +The JSON format is popular primarily among developers. We use it for storing data, configuration files, or as a way to transfer data between programs (e.g., APIs). Its origin stems from the syntax of JavaScript objects, but people now use it accross programming languages. -In Python, it's convenient to read and write CSV files, thanks to the [`csv`](https://docs.python.org/3/library/csv.html) standard library module. First let's try something small in the Python's interactive REPL to familiarize ourselves with the basic usage: +We'll begin with importing the `writeFile` function from the Node.js standard library, so that we can, well, write files: -```py ->>> import csv ->>> with open("data.csv", "w") as file: -... writer = csv.DictWriter(file, fieldnames=["name", "age", "hobbies"]) -... writer.writeheader() -... writer.writerow({"name": "Alice", "age": 24, "hobbies": "kickbox, Python"}) -... writer.writerow({"name": "Bob", "age": 42, "hobbies": "reading, TypeScript"}) -... +```js +import * as cheerio from 'cheerio'; +// highlight-next-line +import { writeFile } from "fs/promises"; ``` -We first opened a new file for writing and created a `DictWriter()` instance with the expected field names. We instructed it to write the header row first and then added two more rows containing actual data. The code produced a `data.csv` file in the same directory where we're running the REPL. It has the following contents: +Next, instead of printing the data, we'll finish the program by exporting it to JSON. Let's replace the line `console.log(data)` with the following: -```csv title=data.csv -name,age,hobbies -Alice,24,"kickbox, Python" -Bob,42,"reading, TypeScript" +```js +const jsonData = JSON.stringify(data); +await writeFile('products.json', jsonData); ``` -In the CSV format, if values contain commas, we should enclose them in quotes. You can see that the writer automatically handled this. - -When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it as well. If you're using a different operating system, try opening the file with any spreadsheet program you have. - -![CSV example preview](images/csv-example.png) - -Now that's nice, but we didn't want Alice, Bob, kickbox, or TypeScript. What we actually want is a CSV containing `Sony XBR-950G BRAVIA 4K HDR Ultra HD TV`, right? Let's do this! First, let's add `csv` to our imports: +That's it! If we run our scraper now, it won't display any output, but it will create a `products.json` file in the current working directory, which contains all the data about the listed products: -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal -# highlight-next-line -import csv + +```json title=products.json +[{"title":"JBL Flip 4 Waterproof Portable Bluetooth Speaker","minPrice":7495,"price":7495},{"title":"Sony XBR-950G BRAVIA 4K HDR Ultra HD TV","minPrice":139800,"price":null},...] ``` -Next, instead of printing the data, we'll finish the program by exporting it to CSV. 
Replace `print(data)` with the following: +If you skim through the data, you'll notice that the `JSON.stringify()` function handled some potential issues, such as escaping double quotes found in one of the titles by adding a backslash: -```py -with open("products.csv", "w") as file: - writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"]) - writer.writeheader() - for row in data: - writer.writerow(row) +```json +{"title":"Sony SACS9 10\" Active Subwoofer","minPrice":15800,"price":15800} ``` -If we run our scraper now, it won't display any output, but it will create a `products.csv` file in the current working directory, which contains all the data about the listed products. - -![CSV preview](images/csv.png) - -## Saving data as JSON - -The JSON format is popular primarily among developers. We use it for storing data, configuration files, or as a way to transfer data between programs (e.g., APIs). Its origin stems from the syntax of objects in the JavaScript programming language, which is similar to the syntax of Python dictionaries. +:::tip Pretty JSON -In Python, there's a [`json`](https://docs.python.org/3/library/json.html) standard library module, which is so straightforward that we can start using it in our code right away. We'll need to begin with imports: +While a compact JSON file without any whitespace is efficient for computers, it can be difficult for humans to read. You can call `JSON.stringify(data, null, 2)` for prettier output. See [documentation](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify) for explanation of the parameters and more examples. -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal -import csv -# highlight-next-line -import json -``` +::: -Next, let’s append one more export to end of the source code of our scraper: +## Saving data as CSV -```py -with open("products.json", "w") as file: - json.dump(data, file) -``` +The CSV format is popular among data analysts because a wide range of tools can import it, including spreadsheets apps like LibreOffice Calc, Microsoft Excel, Apple Numbers, and Google Sheets. -That’s it! If we run the program now, it should also create a `products.json` file in the current working directory: +Neither JavaScript itself nor Node.js offers anything built-in to read and write CSV, so we'll need to install a library. We'll use [json2csv](https://juanjodiaz.github.io/json2csv/), a _de facto_ standard for working with CSV in JavaScript: ```text -$ python main.py -Traceback (most recent call last): - ... - raise TypeError(f'Object of type {o.__class__.__name__} ' -TypeError: Object of type Decimal is not JSON serializable -``` - -Ouch! JSON supports integers and floating-point numbers, but there's no guidance on how to handle `Decimal`. To maintain precision, it's common to store monetary values as strings in JSON files. But this is a convention, not a standard, so we need to handle it manually. We'll pass a custom function to `json.dump()` to serialize objects that it can't handle directly: +$ npm install @json2csv/node --save -```py -def serialize(obj): - if isinstance(obj, Decimal): - return str(obj) - raise TypeError("Object not JSON serializable") - -with open("products.json", "w") as file: - json.dump(data, file, default=serialize) +added 4 packages, and audited 28 packages in 1s +... 
``` -Now the program should work as expected, producing a JSON file with the following content: +Once installed, we can add the following line to our imports: - -```json title=products.json -[{"title": "JBL Flip 4 Waterproof Portable Bluetooth Speaker", "min_price": "74.95", "price": "74.95"}, {"title": "Sony XBR-950G BRAVIA 4K HDR Ultra HD TV", "min_price": "1398.00", "price": null}, ...] +```js +import * as cheerio from 'cheerio'; +import { writeFile } from "fs/promises"; +// highlight-next-line +import { AsyncParser } from '@json2csv/node'; ``` -If you skim through the data, you'll notice that the `json.dump()` function handled some potential issues, such as escaping double quotes found in one of the titles by adding a backslash: +Then, let's add one more data export near the end of the source code of our scraper: -```json -{"title": "Sony SACS9 10\" Active Subwoofer", "min_price": "158.00", "price": "158.00"} +```js +const parser = new AsyncParser(); +const csvData = await parser.parse(data).promise(); +await writeFile("products.csv", csvData); ``` -:::tip Pretty JSON +The program should now also produce a `data.csv` file. When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it. If you're using a different operating system, try opening the file with any spreadsheet program you have. -While a compact JSON file without any whitespace is efficient for computers, it can be difficult for humans to read. You can pass `indent=2` to `json.dump()` for prettier output. +![CSV preview](images/csv.png) -Also, if your data contains non-English characters, set `ensure_ascii=False`. By default, Python encodes everything except [ASCII](https://en.wikipedia.org/wiki/ASCII), which means it would save [Bún bò Nam Bô](https://vi.wikipedia.org/wiki/B%C3%BAn_b%C3%B2_Nam_B%E1%BB%99) as `B\\u00fan b\\u00f2 Nam B\\u00f4`. +In the CSV format, if a value contains commas, we should enclose it in quotes. If it contains quotes, we should double them. When we open the file in a text editor of our choice, we can see that the library automatically handled this: -::: +```csv title=data.csv +"title","minPrice","price" +"JBL Flip 4 Waterproof Portable Bluetooth Speaker",7495,7495 +"Sony XBR-950G BRAVIA 4K HDR Ultra HD TV",139800, +"Sony SACS9 10"" Active Subwoofer",15800,15800 +... +"Samsung Surround Sound Bar Home Speaker, Set of 7 (HW-NW700/ZA)",64799,64799 +... +``` -We've built a Python application that downloads a product listing, parses the data, and saves it in a structured format for further use. But the data still has gaps: for some products, we only have the min price, not the actual prices. In the next lesson, we'll attempt to scrape more details from all the product pages. +We've built a Node.js application that downloads a product listing, parses the data, and saves it in a structured format for further use. But the data still has gaps: for some products, we only have the min price, not the actual prices. In the next lesson, we'll attempt to scrape more details from all the product pages. --- ## Exercises -In this lesson, you learned how to create export files in two formats. The following challenges are designed to help you empathize with the people who'd be working with them. +In this lesson, we created export files in two formats. The following challenges are designed to help you empathize with the people who'd be working with them. 
-### Process your CSV +### Process your JSON -Open the `products.csv` file in a spreadsheet app. Use the app to find all products with a min price greater than $500. +Write a new Node.js program that reads `products.json`, finds all products with a min price greater than $500, and prints each of them.
Solution - Let's use [Google Sheets](https://www.google.com/sheets/about/), which is free to use. After logging in with a Google account: - - 1. Go to **File > Import**, choose **Upload**, and select the file. Import the data using the default settings. You should see a table with all the data. - 2. Select the header row. Go to **Data > Create filter**. - 3. Use the filter icon that appears next to `min_price`. Choose **Filter by condition**, select **Greater than**, and enter **500** in the text field. Confirm the dialog. You should see only the filtered data. + ```js + import { readFile } from "fs/promises"; - ![CSV in Google Sheets](images/csv-sheets.png) + const jsonData = await readFile("products.json"); + const data = JSON.parse(jsonData); + data + .filter(row => row.minPrice > 50000) + .forEach(row => console.log(row)); + ```
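Note the threshold: since the scraper stores prices as integer cents, $500 corresponds to `50000`. If you'd rather print dollar amounts than raw objects, a small helper along these lines would work (illustrative only, not part of the lesson code):

```js
// Illustrative helper: convert integer cents back to a dollar string
const formatPrice = (cents) => cents === null ? "N/A" : `$${(cents / 100).toFixed(2)}`;

console.log(formatPrice(139800)); // $1398.00
```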
-### Process your JSON +### Process your CSV -Write a new Python program that reads `products.json`, finds all products with a min price greater than $500, and prints each one using [`pp()`](https://docs.python.org/3/library/pprint.html#pprint.pp). +Open the `products.csv` file we created in the lesson using a spreadsheet application. Then, in the app, find all products with a min price greater than $500.
Solution - ```py - import json - from pprint import pp - from decimal import Decimal + Let's use [Google Sheets](https://www.google.com/sheets/about/), which is free to use. After logging in with a Google account: - with open("products.json", "r") as file: - products = json.load(file) + 1. Go to **File > Import**, choose **Upload**, and select the file. Import the data using the default settings. You should see a table with all the data. + 2. Select the header row. Go to **Data > Create filter**. + 3. Use the filter icon that appears next to `minPrice`. Choose **Filter by condition**, select **Greater than**, and enter **500** in the text field. Confirm the dialog. You should see only the filtered data. - for product in products: - if Decimal(product["min_price"]) > 500: - pp(product) - ``` + ![CSV in Google Sheets](images/csv-sheets.png)
From 360165b5c5f32606dc012e29358ea752420ad70b Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 30 Jun 2025 10:15:04 +0200 Subject: [PATCH 16/26] fix: missing semicolon in saving data --- .../webscraping/scraping_basics_javascript2/08_saving_data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md index 72a6e140a..bfbb38369 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md @@ -62,7 +62,7 @@ if (response.ok) { } // highlight-next-line - data.push({ title: titleText, ...priceRange }) + data.push({ title: titleText, ...priceRange }); }); // highlight-next-line From fe2528c47e43e74d0a9eec35ac9d61fb5c8c8a9e Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 30 Jun 2025 10:15:38 +0200 Subject: [PATCH 17/26] feat: update getting links to be about JS --- .../09_getting_links.md | 483 +++++++++--------- 1 file changed, 238 insertions(+), 245 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md index 1eebf4cbc..288ef339a 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md @@ -31,189 +31,183 @@ This will help us figure out the actual prices of products, as right now, for so Over the course of the previous lessons, the code of our program grew to almost 50 lines containing downloading, parsing, and exporting: -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal -import json -import csv - -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() - -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") - -data = [] -for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text.strip() - - price_text = ( - product - .select_one(".price") - .contents[-1] - .strip() - .replace("$", "") - .replace(",", "") - ) - if price_text.startswith("From "): - min_price = Decimal(price_text.removeprefix("From ")) - price = None - else: - min_price = Decimal(price_text) - price = min_price - - data.append({"title": title, "min_price": min_price, "price": price}) - -with open("products.csv", "w") as file: - writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"]) - writer.writeheader() - for row in data: - writer.writerow(row) - -def serialize(obj): - if isinstance(obj, Decimal): - return str(obj) - raise TypeError("Object not JSON serializable") - -with open("products.json", "w") as file: - json.dump(data, file, default=serialize) +```js +import * as cheerio from 'cheerio'; +import { writeFile } from 'fs/promises'; +import { AsyncParser } from '@json2csv/node'; + +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); + +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + + const data = []; + $(".product-item").each((i, element) => { + const productItem = $(element); + + const title = productItem.find(".product-item__title"); + const titleText = title.text().trim(); + + const price = 
productItem.find(".price").contents().last(); + const priceRange = { minPrice: null, price: null }; + const priceText = price + .text() + .trim() + .replace("$", "") + .replace(".", "") + .replace(",", ""); + + if (priceText.startsWith("From ")) { + priceRange.minPrice = parseInt(priceText.replace("From ", "")); + } else { + priceRange.minPrice = parseInt(priceText); + priceRange.price = priceRange.minPrice; + } + + data.push({ title: titleText, ...priceRange }); + }); + + const jsonData = JSON.stringify(data); + await writeFile('products.json', jsonData); + + const parser = new AsyncParser(); + const csvData = await parser.parse(data).promise(); + await writeFile('products.csv', csvData); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` -Let's introduce several functions to make the whole thing easier to digest. First, we can turn the beginning of our program into this `download()` function, which takes a URL and returns a `BeautifulSoup` instance: - -```py -def download(url): - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - return BeautifulSoup(html_code, "html.parser") +Let's introduce several functions to make the whole thing easier to digest. First, we can turn the beginning of our program into this `download()` function, which takes a URL and returns a Cheerio object: + +```js +async function download(url) { + const response = await fetch(url); + if (response.ok) { + const html = await response.text(); + return cheerio.load(html); + } else { + throw new Error(`HTTP ${response.status}`); + } +} ``` -Next, we can put parsing into a `parse_product()` function, which takes the product item element and returns the dictionary with data: - -```py -def parse_product(product): - title = product.select_one(".product-item__title").text.strip() - - price_text = ( - product - .select_one(".price") - .contents[-1] - .strip() - .replace("$", "") - .replace(",", "") - ) - if price_text.startswith("From "): - min_price = Decimal(price_text.removeprefix("From ")) - price = None - else: - min_price = Decimal(price_text) - price = min_price - - return {"title": title, "min_price": min_price, "price": price} +Next, we can put parsing into a `parseProduct()` function, which takes the product item element and returns the object with data: + +```js +function parseProduct(productItem) { + const title = productItem.find(".product-item__title"); + const titleText = title.text().trim(); + + const price = productItem.find(".price").contents().last(); + const priceRange = { minPrice: null, price: null }; + const priceText = price + .text() + .trim() + .replace("$", "") + .replace(".", "") + .replace(",", ""); + + if (priceText.startsWith("From ")) { + priceRange.minPrice = parseInt(priceText.replace("From ", "")); + } else { + priceRange.minPrice = parseInt(priceText); + priceRange.price = priceRange.minPrice; + } + + return { title: titleText, ...priceRange }; +} ``` -Now the CSV export. We'll make a small change here. Having to specify the field names is not ideal. What if we add more field names in the parsing function? We'd always have to remember to go and edit the export function as well. If we could figure out the field names in place, we'd remove this dependency. One way would be to infer the field names from the dictionary keys of the first row: +Now the JSON export. 
For better readability of it, let's make a small change here and set the indentation level to two spaces: -```py -def export_csv(file, data): - # highlight-next-line - fieldnames = list(data[0].keys()) - writer = csv.DictWriter(file, fieldnames=fieldnames) - writer.writeheader() - for row in data: - writer.writerow(row) +```js +async function exportJSON(data) { + return JSON.stringify(data, null, 2); +} ``` -:::note Fragile code +:::note Why asynchronous? -The code above assumes the `data` variable contains at least one item, and that all the items have the same keys. This isn't robust and could break, but in our program, this isn't a problem, and omitting these corner cases allows us to keep the code examples more succinct. +The `exportJSON()` function doesn't need to be `async` now, but keeping it makes future changes easier — like switching to an async JSON parser. It also stays consistent with the upcoming `exportCSV()` function, which must be asynchronous. ::: -The last function we'll add will take care of the JSON export. For better readability of the JSON export, let's make a small change here too and set the indentation level to two spaces: - -```py -def export_json(file, data): - def serialize(obj): - if isinstance(obj, Decimal): - return str(obj) - raise TypeError("Object not JSON serializable") +The last function we'll add will take care of the CSV export: - # highlight-next-line - json.dump(data, file, default=serialize, indent=2) +```js +async function exportCSV(data) { + const parser = new AsyncParser(); + return await parser.parse(data).promise(); +} ``` Now let's put it all together: -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal -import json -import csv - -def download(url): - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - return BeautifulSoup(html_code, "html.parser") - -def parse_product(product): - title = product.select_one(".product-item__title").text.strip() - - price_text = ( - product - .select_one(".price") - .contents[-1] - .strip() - .replace("$", "") - .replace(",", "") - ) - if price_text.startswith("From "): - min_price = Decimal(price_text.removeprefix("From ")) - price = None - else: - min_price = Decimal(price_text) - price = min_price - - return {"title": title, "min_price": min_price, "price": price} - -def export_csv(file, data): - fieldnames = list(data[0].keys()) - writer = csv.DictWriter(file, fieldnames=fieldnames) - writer.writeheader() - for row in data: - writer.writerow(row) - -def export_json(file, data): - def serialize(obj): - if isinstance(obj, Decimal): - return str(obj) - raise TypeError("Object not JSON serializable") - - json.dump(data, file, default=serialize, indent=2) - -listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -listing_soup = download(listing_url) - -data = [] -for product in listing_soup.select(".product-item"): - item = parse_product(product) - data.append(item) - -with open("products.csv", "w") as file: - export_csv(file, data) - -with open("products.json", "w") as file: - export_json(file, data) +```js +import * as cheerio from 'cheerio'; +import { writeFile } from 'fs/promises'; +import { AsyncParser } from '@json2csv/node'; + +async function download(url) { + const response = await fetch(url); + if (response.ok) { + const html = await response.text(); + return cheerio.load(html); + } else { + throw new Error(`HTTP ${response.status}`); + } +} + +function parseProduct(productItem) { + const title = 
productItem.find(".product-item__title"); + const titleText = title.text().trim(); + + const price = productItem.find(".price").contents().last(); + const priceRange = { minPrice: null, price: null }; + const priceText = price + .text() + .trim() + .replace("$", "") + .replace(".", "") + .replace(",", ""); + + if (priceText.startsWith("From ")) { + priceRange.minPrice = parseInt(priceText.replace("From ", "")); + } else { + priceRange.minPrice = parseInt(priceText); + priceRange.price = priceRange.minPrice; + } + + return { title: titleText, ...priceRange }; +} + +async function exportJSON(data) { + return JSON.stringify(data, null, 2); +} + +async function exportCSV(data) { + const parser = new AsyncParser(); + return await parser.parse(data).promise(); +} + +const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" +const $ = await download(listingURL); + +const data = [] +$(".product-item").each((i, element) => { + const productItem = $(element); + const item = parseProduct(productItem); + data.push(item); +}); + +await writeFile('products.json', await exportJSON(data)); +await writeFile('products.csv', await exportCSV(data)); ``` -The program is much easier to read now. With the `parse_product()` function handy, we could also replace the convoluted loop with one that only takes up four lines of code. +The program is much easier to read now. With the `parseProduct()` function handy, we could also replace the convoluted loop with one that only takes up five lines of code. :::tip Refactoring @@ -235,35 +229,36 @@ Several methods exist for transitioning from one page to another, but the most c Text of the link ``` -In DevTools, we can see that each product title is, in fact, also a link element. We already locate the titles, so that makes our task easier. We just need to edit the code so that it extracts not only the text of the element but also the `href` attribute. Beautiful Soup elements support accessing attributes as if they were dictionary keys: +In DevTools, we can see that each product title is, in fact, also a link element. We already locate the titles, so that makes our task easier. We just need to edit the code so that it extracts not only the text of the element but also the `href` attribute. Cheerio selections support accessing attributes using the `.attr()` method: -```py -def parse_product(product): - title_element = product.select_one(".product-item__title") - title = title_element.text.strip() - url = title_element["href"] +```js +function parseProduct(productItem) { + const title = productItem.find(".product-item__title"); + const titleText = title.text().trim(); + const url = title.attr("href"); - ... + ... - return {"title": title, "min_price": min_price, "price": price, "url": url} + return { url, title: titleText, ...priceRange }; +} ``` -In the previous code example, we've also added the URL to the dictionary returned by the function. If we run the scraper now, it should produce exports where each product contains a link to its product page: +In the previous code example, we've also added the URL to the object returned by the function. 
If we run the scraper now, it should produce exports where each product contains a link to its product page: ```json title=products.json [ { + "url": "/products/jbl-flip-4-waterproof-portable-bluetooth-speaker", "title": "JBL Flip 4 Waterproof Portable Bluetooth Speaker", - "min_price": "74.95", - "price": "74.95", - "url": "/products/jbl-flip-4-waterproof-portable-bluetooth-speaker" + "minPrice": 7495, + "price": 7495 }, { + "url": "/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv", "title": "Sony XBR-950G BRAVIA 4K HDR Ultra HD TV", - "min_price": "1398.00", - "price": null, - "url": "/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv" + "minPrice": 139800, + "price": null }, ... ] @@ -273,44 +268,37 @@ Hmm, but that isn't what we wanted! Where is the beginning of each URL? It turns ## Turning relative links into absolute -Browsers reading the HTML know the base address and automatically resolve such links, but we'll have to do this manually. The function [`urljoin`](https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin) from Python's standard library will help us. Let's add it to our imports first: - -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal -import json -import csv -# highlight-next-line -from urllib.parse import urljoin -``` +Browsers reading the HTML know the base address and automatically resolve such links, but we'll have to do this manually. The built-in [`URL`](https://developer.mozilla.org/en-US/docs/Web/API/URL) object will help us. -Next, we'll change the `parse_product()` function so that it also takes the base URL as an argument and then joins it with the relative URL to the product page: +We'll change the `parseProduct()` function so that it also takes the base URL as an argument and then joins it with the relative URL to the product page: -```py -# highlight-next-line -def parse_product(product, base_url): - title_element = product.select_one(".product-item__title") - title = title_element.text.strip() - # highlight-next-line - url = urljoin(base_url, title_element["href"]) +```js +// highlight-next-line +function parseProduct(productItem, baseURL) { + const title = productItem.find(".product-item__title"); + const titleText = title.text().trim(); + // highlight-next-line + const url = new URL(title.attr("href"), baseURL).href; - ... + ... 
- return {"title": title, "min_price": min_price, "price": price, "url": url} + return { url, title: titleText, ...priceRange }; +} ``` Now we'll pass the base URL to the function in the main body of our program: -```py -listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -listing_soup = download(listing_url) - -data = [] -for product in listing_soup.select(".product-item"): - # highlight-next-line - item = parse_product(product, listing_url) - data.append(item) +```js +const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" +const $ = await download(listingURL); + +const data = [] +$(".product-item").each((i, element) => { + const productItem = $(element); + // highlight-next-line + const item = parseProduct(productItem, listingURL); + data.push(item); +}); ``` When we run the scraper now, we should see full URLs in our exports: @@ -319,16 +307,16 @@ When we run the scraper now, we should see full URLs in our exports: ```json title=products.json [ { + "url": "https://warehouse-theme-metal.myshopify.com/products/jbl-flip-4-waterproof-portable-bluetooth-speaker", "title": "JBL Flip 4 Waterproof Portable Bluetooth Speaker", - "min_price": "74.95", - "price": "74.95", - "url": "https://warehouse-theme-metal.myshopify.com/products/jbl-flip-4-waterproof-portable-bluetooth-speaker" + "minPrice": 7495, + "price": 7495 }, { + "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv", "title": "Sony XBR-950G BRAVIA 4K HDR Ultra HD TV", - "min_price": "1398.00", - "price": null, - "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv" + "minPrice": 139800, + "price": null }, ... ] @@ -342,7 +330,7 @@ Ta-da! We've managed to get links leading to the product pages. In the next less ### Scrape links to countries in Africa -Download Wikipedia's page with the list of African countries, use Beautiful Soup to parse it, and print links to Wikipedia pages of all the states and territories mentioned in all tables. Start with this URL: +Download Wikipedia's page with the list of African countries, use Cheerio to parse it, and print links to Wikipedia pages of all the states and territories mentioned in all tables. Start with this URL: ```text https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa @@ -361,29 +349,32 @@ https://en.wikipedia.org/wiki/Botswana
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin - - listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" - response = httpx.get(listing_url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for name_cell in soup.select(".wikitable tr td:nth-child(3)"): - link = name_cell.select_one("a") - url = urljoin(listing_url, link["href"]) - print(url) + ```js + import * as cheerio from 'cheerio'; + + const listingURL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"; + const response = await fetch(listingURL); + + if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + + $(".wikitable tr td:nth-child(3)").each((i, element) => { + const nameCell = $(element); + const link = nameCell.find("a").first(); + const url = new URL(link.attr("href"), listingURL).href; + console.log(url); + }); + } else { + throw new Error(`HTTP ${response.status}`); + } ```
### Scrape links to F1 news -Download Guardian's page with the latest F1 news, use Beautiful Soup to parse it, and print links to all the listed articles. Start with this URL: +Download Guardian's page with the latest F1 news, use Cheerio to parse it, and print links to all the listed articles. Start with this URL: ```text https://www.theguardian.com/sport/formulaone @@ -402,22 +393,24 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin + ```js + import * as cheerio from 'cheerio'; - url = "https://www.theguardian.com/sport/formulaone" - response = httpx.get(url) - response.raise_for_status() + const listingURL = "https://www.theguardian.com/sport/formulaone"; + const response = await fetch(listingURL); - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") + if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); - for item in soup.select("#maincontent ul li"): - link = item.select_one("a") - url = urljoin(url, link["href"]) - print(url) + $("#maincontent ul li").each((i, element) => { + const link = $(element).find("a").first(); + const url = new URL(link.attr("href"), listingURL).href; + console.log(url); + }); + } else { + throw new Error(`HTTP ${response.status}`); + } ``` Note that some cards contain two links. One leads to the article, and one to the comments. If we selected all the links in the list by `#maincontent ul li a`, we would get incorrect output like this: From 0c94e58b5418c5f971379fbef4ffb9c31d63410f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 30 Jun 2025 10:30:06 +0200 Subject: [PATCH 18/26] fix: make it clearer in saving data that we append more code --- .../webscraping/scraping_basics_javascript2/08_saving_data.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md index bfbb38369..332567b92 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md @@ -168,6 +168,9 @@ import { AsyncParser } from '@json2csv/node'; Then, let's add one more data export near the end of the source code of our scraper: ```js +const jsonData = JSON.stringify(data); +await writeFile('products.json', jsonData); + const parser = new AsyncParser(); const csvData = await parser.parse(data).promise(); await writeFile("products.csv", csvData); From ce9019054ba71352de3441444c167ef34689a308 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 30 Jun 2025 17:05:08 +0200 Subject: [PATCH 19/26] feat: change naming of JS variables, update crawling to be about JS --- .../06_locating_elements.md | 54 +++-- .../07_extracting_data.md | 66 +++--- .../08_saving_data.md | 54 ++--- .../09_getting_links.md | 92 ++++----- .../10_crawling.md | 194 ++++++++++-------- .../11_scraping_variants.md | 4 - .../12_framework.md | 2 - .../scraping_basics_python/08_saving_data.md | 4 +- .../scraping_basics_python/10_crawling.md | 8 +- .../11_scraping_variants.md | 4 - .../scraping_basics_python/12_framework.md | 2 - 11 files changed, 238 insertions(+), 246 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md index 21342a6eb..355a2725e 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md @@ -80,15 +80,15 @@ if (response.ok) { const $ = cheerio.load(html); $(".product-item").each((i, element) => { - const productItem = $(element); + const $productItem = $(element); - const title = productItem.find(".product-item__title"); - const titleText = title.text(); + const $title = 
$productItem.find(".product-item__title"); + const title = $title.text(); - const price = productItem.find(".price"); - const priceText = price.text(); + const $price = $productItem.find(".price"); + const price = $price.text(); - console.log(`${titleText} | ${priceText}`); + console.log(`${title} | ${price}`); }); } else { throw new Error(`HTTP ${response.status}`); @@ -170,16 +170,16 @@ if (response.ok) { const $ = cheerio.load(html); $(".product-item").each((i, element) => { - const productItem = $(element); + const $productItem = $(element); - const title = productItem.find(".product-item__title"); - const titleText = title.text(); + const $title = $productItem.find(".product-item__title"); + const title = $title.text(); // highlight-next-line - const price = productItem.find(".price").contents().last(); - const priceText = price.text(); + const $price = $productItem.find(".price").contents().last(); + const price = $price.text(); - console.log(`${titleText} | ${priceText}`); + console.log(`${title} | ${price}`); }); } else { throw new Error(`HTTP ${response.status}`); @@ -243,18 +243,17 @@ Djibouti const $ = cheerio.load(html); $(".wikitable").each((i, tableElement) => { - const table = $(tableElement); - const rows = table.find("tr"); - - rows.each((j, rowElement) => { - const row = $(rowElement); - const cells = row.find("td"); - - if (cells.length > 0) { - const thirdColumn = $(cells[2]); - const link = thirdColumn.find("a").first(); - const linkText = link.text(); - console.log(linkText); + const $table = $(tableElement); + const $rows = $table.find("tr"); + + $rows.each((j, rowElement) => { + const $row = $(rowElement); + const $cells = $row.find("td"); + + if ($cells.length > 0) { + const $thirdColumn = $($cells[2]); + const $link = $thirdColumn.find("a").first(); + console.log($link.text()); } }); }); @@ -289,10 +288,9 @@ Simplify the code from previous exercise. Use a single for loop and a single CSS const $ = cheerio.load(html); $(".wikitable tr td:nth-child(3)").each((i, element) => { - const nameCell = $(element); - const link = nameCell.find("a").first(); - const linkText = link.text(); - console.log(linkText); + const $nameCell = $(element); + const $link = $nameCell.find("a").first(); + console.log($link.text()); }); } else { throw new Error(`HTTP ${response.status}`); diff --git a/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md b/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md index 61239c7c0..142cceb97 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md @@ -36,14 +36,14 @@ It's because some products have variants with different prices. Later in the cou Ideally we'd go and discuss the problem with those who are about to use the resulting data. For their purposes, is the fact that some prices are just minimum prices important? What would be the most useful representation of the range for them? Maybe they'd tell us that it's okay if we just remove the `From` prefix? ```js -const priceText = price.text().replace("From ", ""); +const priceText = $price.text().replace("From ", ""); ``` In other cases, they'd tell us the data must include the range. And in cases when we just don't know, the safest option is to include all the information we have and leave the decision on what's important to later stages. One approach could be having the exact and minimum prices as separate values. 
If we don't know the exact price, we leave it empty: ```js const priceRange = { minPrice: null, price: null }; -const priceText = price.text() +const priceText = $price.text() if (priceText.startsWith("From ")) { priceRange.minPrice = priceText.replace("From ", ""); } else { @@ -71,14 +71,14 @@ if (response.ok) { const $ = cheerio.load(html); $(".product-item").each((i, element) => { - const productItem = $(element); + const $productItem = $(element); - const title = productItem.find(".product-item__title"); - const titleText = title.text(); + const $title = $productItem.find(".product-item__title"); + const title = $title.text(); - const price = productItem.find(".price").contents().last(); + const $price = $productItem.find(".price").contents().last(); const priceRange = { minPrice: null, price: null }; - const priceText = price.text(); + const priceText = $price.text(); if (priceText.startsWith("From ")) { priceRange.minPrice = priceText.replace("From ", ""); } else { @@ -86,7 +86,7 @@ if (response.ok) { priceRange.price = priceRange.minPrice; } - console.log(`${titleText} | ${priceRange.minPrice} | ${priceRange.price}`); + console.log(`${title} | ${priceRange.minPrice} | ${priceRange.price}`); }); } else { throw new Error(`HTTP ${response.status}`); @@ -100,9 +100,9 @@ Often, the strings we extract from a web page start or end with some amount of w We call the operation of removing whitespace _trimming_ or _stripping_, and it's so useful in many applications that programming languages and libraries include ready-made tools for it. Let's add JavaScript's built-in [.trim()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/trim): ```js -const titleText = title.text().trim(); +const title = $title.text().trim(); -const priceText = price.text().trim(); +const priceText = $price.text().trim(); ``` ## Removing dollar sign and commas @@ -124,7 +124,7 @@ The demonstration above is inside the Node.js' [interactive REPL](https://nodejs We need to remove the dollar sign and the decimal commas. For this type of cleaning, [regular expressions](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions) are often the best tool for the job, but in this case [`.replace()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/replace) is also sufficient: ```js -const priceText = price +const priceText = $price .text() .trim() .replace("$", "") @@ -137,7 +137,7 @@ Now we should be able to add `parseFloat()`, so that we have the prices not as a ```js const priceRange = { minPrice: null, price: null }; -const priceText = price.text() +const priceText = $price.text() if (priceText.startsWith("From ")) { priceRange.minPrice = parseFloat(priceText.replace("From ", "")); } else { @@ -156,7 +156,7 @@ Great! Only if we didn't overlook an important pitfall called [floating-point er These errors are small and usually don't matter, but sometimes they can add up and cause unpleasant discrepancies. That's why it's typically best to avoid floating point numbers when working with money. 
We won't store dollars, but cents: ```js -const priceText = price +const priceText = $price .text() .trim() .replace("$", "") @@ -178,14 +178,14 @@ if (response.ok) { const $ = cheerio.load(html); $(".product-item").each((i, element) => { - const productItem = $(element); + const $productItem = $(element); - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); + const $title = $productItem.find(".product-item__title"); + const titleText = $title.text().trim(); - const price = productItem.find(".price").contents().last(); + const $price = $productItem.find(".price").contents().last(); const priceRange = { minPrice: null, price: null }; - const priceText = price + const priceText = $price .text() .trim() .replace("$", "") @@ -199,7 +199,7 @@ if (response.ok) { priceRange.price = priceRange.minPrice; } - console.log(`${titleText} | ${priceRange.minPrice} | ${priceRange.price}`); + console.log(`${title} | ${priceRange.minPrice} | ${priceRange.price}`); }); } else { throw new Error(`HTTP ${response.status}`); @@ -249,12 +249,12 @@ Denon AH-C720 In-Ear Headphones | 236 const $ = cheerio.load(html); $(".product-item").each((i, element) => { - const productItem = $(element); + const $productItem = $(element); - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); + const title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); - const unitsText = productItem + const unitsText = $productItem .find(".product-item__inventory") .text() .replace("In stock,", "") @@ -265,7 +265,7 @@ Denon AH-C720 In-Ear Headphones | 236 const unitsCount = unitsText === "Sold out" ? 0 : parseInt(unitsText); - console.log(`${titleText} | ${unitsCount}`); + console.log(`${title} | ${unitsCount}`); }); } else { throw new Error(`HTTP ${response.status}`); @@ -298,19 +298,19 @@ Simplify the code from previous exercise. Use [regular expressions](https://deve const $ = cheerio.load(html); $(".product-item").each((i, element) => { - const productItem = $(element); + const $productItem = $(element); - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); - const unitsText = productItem + const unitsText = $productItem .find(".product-item__inventory") .text() .trim(); const unitsCount = unitsText === "Sold out" ? 
0 : parseInt(unitsText.match(/\d+/)); - console.log(`${titleText} | ${unitsCount}`); + console.log(`${title} | ${unitsCount}`); }); } else { throw new Error(`HTTP ${response.status}`); @@ -364,19 +364,19 @@ Hints: const $ = cheerio.load(html); $("#maincontent ul li").each((i, element) => { - const article = $(element); + const $article = $(element); - const titleText = article + const title = $article .find("h3") .text() .trim(); - const dateText = article + const dateText = $article .find("time") .attr("datetime") .trim(); const date = new Date(dateText); - console.log(`${titleText} | ${date.toDateString()}`); + console.log(`${title} | ${date.toDateString()}`); }); } else { throw new Error(`HTTP ${response.status}`); diff --git a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md index 332567b92..b5aca4862 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md @@ -25,7 +25,7 @@ We should use widely popular formats that have well-defined solutions for all th ## Collecting data -Producing results line by line is an efficient approach to handling large datasets, but to simplify this lesson, we'll store all our data in one variable. This'll take three changes to our program: +Producing results line by line is an efficient approach to handling large datasets, but to simplify this lesson, we'll store all our data in one variable. This'll take four changes to our program: ```js import * as cheerio from 'cheerio'; @@ -38,16 +38,15 @@ if (response.ok) { const $ = cheerio.load(html); // highlight-next-line - const data = []; - $(".product-item").each((i, element) => { - const productItem = $(element); + const $items = $(".product-item").map((i, element) => { + const $productItem = $(element); - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); - const price = productItem.find(".price").contents().last(); + const $price = $productItem.find(".price").contents().last(); const priceRange = { minPrice: null, price: null }; - const priceText = price + const priceText = $price .text() .trim() .replace("$", "") @@ -62,9 +61,10 @@ if (response.ok) { } // highlight-next-line - data.push({ title: titleText, ...priceRange }); + return { title, ...priceRange }; }); - + // highlight-next-line + const data = $items.get(); // highlight-next-line console.log(data); } else { @@ -72,7 +72,23 @@ if (response.ok) { } ``` -Before looping over the products, we prepare an empty array. Then, instead of printing each line, we append the data of each product to the array in the form of a JavaScript object. At the end of the program, we print the entire array at once. +Instead of printing each line, we now return the data for each product as a JavaScript object. We've replaced `.each()` with [`.map()`](https://cheerio.js.org/docs/api/classes/Cheerio#map-3), which also iterates over the selection but, in addition, collects all the results and returns them as a Cheerio collection. We then convert it into a standard JavaScript array by calling [`.get()`](https://cheerio.js.org/docs/api/classes/Cheerio#call-signature-32). Near the end of the program, we print the entire array. 
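Here's a minimal, self-contained illustration of that `.map()` and `.get()` pairing — the HTML snippet is made up just for this example:

```js
import * as cheerio from 'cheerio';

const $ = cheerio.load("<ul><li>JBL Flip 4</li><li>Sony SACS9</li></ul>");

// .map() collects one value per matched element into a Cheerio collection
const $titles = $("li").map((i, element) => $(element).text());

// .get() converts that collection into a plain JavaScript array
console.log($titles.get()); // [ 'JBL Flip 4', 'Sony SACS9' ]
```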
+ +:::tip Advanced syntax + +When returning the item object, we use [shorthand property syntax](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Object_initializer#property_definitions) to set the title, and [spread syntax](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Spread_syntax) to set the prices. It's the same as if we wrote the following: + +```js +{ + title: title, + minPrice: priceRange.minPrice, + price: priceRange.price, +} +``` + +::: + +The program should now print the results as a single large JavaScript array: ```text $ node index.js @@ -91,20 +107,6 @@ $ node index.js ] ``` -:::tip Spread syntax - -The three dots in `{ title: titleText, ...priceRange }` are called [spread syntax](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Spread_syntax). It's the same as if we wrote the following: - -```js -{ - title: titleText, - minPrice: priceRange.minPrice, - price: priceRange.price, -} -``` - -::: - ## Saving data as JSON The JSON format is popular primarily among developers. We use it for storing data, configuration files, or as a way to transfer data between programs (e.g., APIs). Its origin stems from the syntax of JavaScript objects, but people now use it accross programming languages. @@ -202,7 +204,7 @@ In this lesson, we created export files in two formats. The following challenges ### Process your JSON -Write a new Node.js program that reads `products.json`, finds all products with a min price greater than $500, and prints each of them. +Write a new Node.js program that reads the `products.json` file we created in the lesson, finds all products with a min price greater than $500, and prints each of them.
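+
+If you want a nudge before peeking at the solution below, a minimal sketch of one possible approach could look like this. It assumes `products.json` sits next to the script and that prices are stored in cents, as in this lesson, so $500 is 50000:
+
+```js
+import { readFile } from 'fs/promises';
+
+const json = await readFile('products.json', 'utf8');
+const data = JSON.parse(json);
+
+// $500 expressed in cents
+const expensive = data.filter(item => item.minPrice > 50000);
+
+for (const item of expensive) {
+  console.log(item);
+}
+```
+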
Solution diff --git a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md index 288ef339a..027acdad5 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md @@ -43,16 +43,15 @@ if (response.ok) { const html = await response.text(); const $ = cheerio.load(html); - const data = []; - $(".product-item").each((i, element) => { - const productItem = $(element); + const $items = $(".product-item").map((i, element) => { + const $productItem = $(element); - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); - const price = productItem.find(".price").contents().last(); + const $price = $productItem.find(".price").contents().last(); const priceRange = { minPrice: null, price: null }; - const priceText = price + const priceText = $price .text() .trim() .replace("$", "") @@ -66,8 +65,9 @@ if (response.ok) { priceRange.price = priceRange.minPrice; } - data.push({ title: titleText, ...priceRange }); + return { title, ...priceRange }; }); + const data = $items.get(); const jsonData = JSON.stringify(data); await writeFile('products.json', jsonData); @@ -97,13 +97,13 @@ async function download(url) { Next, we can put parsing into a `parseProduct()` function, which takes the product item element and returns the object with data: ```js -function parseProduct(productItem) { - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); +function parseProduct($productItem) { + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); - const price = productItem.find(".price").contents().last(); + const $price = $productItem.find(".price").contents().last(); const priceRange = { minPrice: null, price: null }; - const priceText = price + const priceText = $price .text() .trim() .replace("$", "") @@ -117,24 +117,18 @@ function parseProduct(productItem) { priceRange.price = priceRange.minPrice; } - return { title: titleText, ...priceRange }; + return { title, ...priceRange }; } ``` Now the JSON export. For better readability of it, let's make a small change here and set the indentation level to two spaces: ```js -async function exportJSON(data) { +function exportJSON(data) { return JSON.stringify(data, null, 2); } ``` -:::note Why asynchronous? - -The `exportJSON()` function doesn't need to be `async` now, but keeping it makes future changes easier — like switching to an async JSON parser. It also stays consistent with the upcoming `exportCSV()` function, which must be asynchronous. 
- -::: - The last function we'll add will take care of the CSV export: ```js @@ -161,13 +155,13 @@ async function download(url) { } } -function parseProduct(productItem) { - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); +function parseProduct($productItem) { + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); - const price = productItem.find(".price").contents().last(); + const $price = $productItem.find(".price").contents().last(); const priceRange = { minPrice: null, price: null }; - const priceText = price + const priceText = $price .text() .trim() .replace("$", "") @@ -181,10 +175,10 @@ function parseProduct(productItem) { priceRange.price = priceRange.minPrice; } - return { title: titleText, ...priceRange }; + return { title, ...priceRange }; } -async function exportJSON(data) { +function exportJSON(data) { return JSON.stringify(data, null, 2); } @@ -196,14 +190,14 @@ async function exportCSV(data) { const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" const $ = await download(listingURL); -const data = [] -$(".product-item").each((i, element) => { - const productItem = $(element); - const item = parseProduct(productItem); - data.push(item); +const $items = $(".product-item").map((i, element) => { + const $productItem = $(element); + const item = parseProduct($productItem); + return item; }); +const data = $items.get(); -await writeFile('products.json', await exportJSON(data)); +await writeFile('products.json', exportJSON(data)); await writeFile('products.csv', await exportCSV(data)); ``` @@ -232,14 +226,14 @@ Several methods exist for transitioning from one page to another, but the most c In DevTools, we can see that each product title is, in fact, also a link element. We already locate the titles, so that makes our task easier. We just need to edit the code so that it extracts not only the text of the element but also the `href` attribute. Cheerio selections support accessing attributes using the `.attr()` method: ```js -function parseProduct(productItem) { - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); - const url = title.attr("href"); +function parseProduct($productItem) { + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); + const url = $title.attr("href"); ... - return { url, title: titleText, ...priceRange }; + return { url, title, ...priceRange }; } ``` @@ -274,15 +268,15 @@ We'll change the `parseProduct()` function so that it also takes the base URL as ```js // highlight-next-line -function parseProduct(productItem, baseURL) { - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); +function parseProduct($productItem, baseURL) { + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); // highlight-next-line - const url = new URL(title.attr("href"), baseURL).href; + const url = new URL($title.attr("href"), baseURL).href; ... 
- return { url, title: titleText, ...priceRange }; + return { url, title, ...priceRange }; } ``` @@ -292,13 +286,13 @@ Now we'll pass the base URL to the function in the main body of our program: const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" const $ = await download(listingURL); -const data = [] -$(".product-item").each((i, element) => { - const productItem = $(element); +const $items = $(".product-item").map((i, element) => { + const $productItem = $(element); // highlight-next-line - const item = parseProduct(productItem, listingURL); - data.push(item); + const item = parseProduct($productItem, listingURL); + return item; }); +const data = $items.get(); ``` When we run the scraper now, we should see full URLs in our exports: diff --git a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md index 98d47b54e..79f6e8ed0 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md @@ -12,75 +12,71 @@ import Exercises from './_exercises.mdx'; --- -In previous lessons we've managed to download the HTML code of a single page, parse it with BeautifulSoup, and extract relevant data from it. We'll do the same now for each of the products. +In previous lessons we've managed to download the HTML code of a single page, parse it with Cheerio, and extract relevant data from it. We'll do the same now for each of the products. Thanks to the refactoring, we have functions ready for each of the tasks, so we won't need to repeat ourselves in our code. This is what you should see in your editor now: -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal -import json -import csv -from urllib.parse import urljoin - -def download(url): - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - return BeautifulSoup(html_code, "html.parser") - -def parse_product(product, base_url): - title_element = product.select_one(".product-item__title") - title = title_element.text.strip() - url = urljoin(base_url, title_element["href"]) - - price_text = ( - product - .select_one(".price") - .contents[-1] - .strip() - .replace("$", "") - .replace(",", "") - ) - if price_text.startswith("From "): - min_price = Decimal(price_text.removeprefix("From ")) - price = None - else: - min_price = Decimal(price_text) - price = min_price - - return {"title": title, "min_price": min_price, "price": price, "url": url} - -def export_csv(file, data): - fieldnames = list(data[0].keys()) - writer = csv.DictWriter(file, fieldnames=fieldnames) - writer.writeheader() - for row in data: - writer.writerow(row) - -def export_json(file, data): - def serialize(obj): - if isinstance(obj, Decimal): - return str(obj) - raise TypeError("Object not JSON serializable") - - json.dump(data, file, default=serialize, indent=2) - -listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -listing_soup = download(listing_url) - -data = [] -for product in listing_soup.select(".product-item"): - item = parse_product(product, listing_url) - data.append(item) - -with open("products.csv", "w") as file: - export_csv(file, data) - -with open("products.json", "w") as file: - export_json(file, data) +```js +import * as cheerio from 'cheerio'; +import { writeFile } from 'fs/promises'; +import { AsyncParser } from '@json2csv/node'; + +async function download(url) { + const 
response = await fetch(url); + if (response.ok) { + const html = await response.text(); + return cheerio.load(html); + } else { + throw new Error(`HTTP ${response.status}`); + } +} + +function parseProduct(productItem, baseURL) { + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); + const url = new URL($title.attr("href"), baseURL).href; + + const $price = $productItem.find(".price").contents().last(); + const priceRange = { minPrice: null, price: null }; + const priceText = $price + .text() + .trim() + .replace("$", "") + .replace(".", "") + .replace(",", ""); + + if (priceText.startsWith("From ")) { + priceRange.minPrice = parseInt(priceText.replace("From ", "")); + } else { + priceRange.minPrice = parseInt(priceText); + priceRange.price = priceRange.minPrice; + } + + return { url, title, ...priceRange }; +} + +function exportJSON(data) { + return JSON.stringify(data, null, 2); +} + +async function exportCSV(data) { + const parser = new AsyncParser(); + return await parser.parse(data).promise(); +} + +const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" +const $ = await download(listingURL); + +const $items = $(".product-item").map((i, element) => { + const $productItem = $(element); + // highlight-next-line + const item = parseProduct($productItem, listingURL); + return item; +}); +const data = $items.get(); + +await writeFile('products.json', exportJSON(data)); +await writeFile('products.csv', await exportCSV(data)); ``` ## Extracting vendor name @@ -125,51 +121,69 @@ Depending on what's valuable for our use case, we can now use the same technique It looks like using a CSS selector to locate the element with the `product-meta__vendor` class, and then extracting its text, should be enough to get the vendor name as a string: -```py -vendor = product_soup.select_one(".product-meta__vendor").text.strip() +```js +const vendor = $(".product-meta__vendor").text().trim(); ``` But where do we put this line in our program? ## Crawling product detail pages -In the `data` loop we're already going through all the products. Let's expand it to include downloading the product detail page, parsing it, extracting the vendor's name, and adding it as a new key in the item's dictionary: +In the `.map()` loop, we're already going through all the products. Let's expand it to include downloading the product detail page, parsing it, extracting the vendor's name, and adding it to the item object. -```py -... +First, we need to make the loop asynchronous so that we can use `await download()` for each product. We'll add the `async` keyword to the inner function and rename the collection to `$promises`, since it will now store promises that resolve to items rather than the items themselves. We'll still convert the collection to a standard JavaScript array, but this time we'll pass it to `await Promise.all()` to resolve all the promises and retrieve the actual items. 
-listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -listing_soup = download(listing_url) +```js +const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" +const $ = await download(listingURL); -data = [] -for product in listing_soup.select(".product-item"): - item = parse_product(product, listing_url) - # highlight-next-line - product_soup = download(item["url"]) - # highlight-next-line - item["vendor"] = product_soup.select_one(".product-meta__vendor").text.strip() - data.append(item) +// highlight-next-line +const $promises = $(".product-item").map(async (i, element) => { + const $productItem = $(element); + const item = parseProduct($productItem, listingURL); + return item; +}); +// highlight-next-line +const data = await Promise.all($promises.get()); +``` -... +The program behaves the same as before, but now the code is prepared to make HTTP requests from within the inner function. Let's do it: + +```js +const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" +const $ = await download(listingURL); + +const $promises = $(".product-item").map(async (i, element) => { + const $productItem = $(element); + const item = parseProduct($productItem, listingURL); + // highlight-next-line + const $p = await download(item.url); + // highlight-next-line + item.vendor = $p(".product-meta__vendor").text().trim(); + return item; +}); +const data = await Promise.all($promises.get()); ``` +We download each product detail page and parse its HTML using Cheerio. The `$p` variable is the root of a Cheerio object tree, similar to but distinct from the `$` used for the listing page. That's why we use `$p()` instead of `$p.find()`. + If we run the program now, it'll take longer to finish since it's making 24 more HTTP requests. But in the end, it should produce exports with a new field containing the vendor's name: ```json title=products.json [ { - "title": "JBL Flip 4 Waterproof Portable Bluetooth Speaker", - "min_price": "74.95", - "price": "74.95", "url": "https://warehouse-theme-metal.myshopify.com/products/jbl-flip-4-waterproof-portable-bluetooth-speaker", + "title": "JBL Flip 4 Waterproof Portable Bluetooth Speaker", + "minPrice": 7495, + "price": 7495, "vendor": "JBL" }, { + "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv", "title": "Sony XBR-950G BRAVIA 4K HDR Ultra HD TV", - "min_price": "1398.00", + "minPrice": 139800, "price": null, - "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv", "vendor": "Sony" }, ... @@ -178,7 +192,7 @@ If we run the program now, it'll take longer to finish since it's making 24 more ## Extracting price -Scraping the vendor's name is nice, but the main reason we started checking the detail pages in the first place was to figure out how to get a price for each product. From the product listing, we could only scrape the min price, and remember—we’re building a Python app to track prices! +Scraping the vendor's name is nice, but the main reason we started checking the detail pages in the first place was to figure out how to get a price for each product. From the product listing, we could only scrape the min price, and remember—we're building a Node.js app to track prices! 
Looking at the [Sony XBR-950G BRAVIA](https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv), it's clear that the listing only shows min prices, because some products have variants, each with a different price. And different stock availability. And different SKUs… @@ -206,12 +220,12 @@ https://en.wikipedia.org/wiki/Angola +244 https://en.wikipedia.org/wiki/Benin +229 https://en.wikipedia.org/wiki/Botswana +267 https://en.wikipedia.org/wiki/Burkina_Faso +226 -https://en.wikipedia.org/wiki/Burundi None +https://en.wikipedia.org/wiki/Burundi null https://en.wikipedia.org/wiki/Cameroon +237 ... ``` -Hint: Locating cells in tables is sometimes easier if you know how to [navigate up](https://beautiful-soup-4.readthedocs.io/en/latest/index.html#going-up) in the HTML element soup. +Hint: Locating cells in tables is sometimes easier if you know how to [navigate up](https://cheerio.js.org/docs/api/classes/Cheerio#parent) in the HTML element tree.
Solution diff --git a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md index 6cebba658..414ac82f0 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md @@ -72,8 +72,6 @@ These elements aren't visible to regular visitors. They're there just in case Ja Using our knowledge of Beautiful Soup, we can locate the options and extract the data we need: ```py -... - listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" listing_soup = download(listing_url) @@ -89,8 +87,6 @@ for product in listing_soup.select(".product-item"): else: item["variant_name"] = None data.append(item) - -... ``` The CSS selector `.product-form__option.no-js` matches elements with both `product-form__option` and `no-js` classes. Then we're using the [descendant combinator](https://developer.mozilla.org/en-US/docs/Web/CSS/Descendant_combinator) to match all `option` elements somewhere inside the `.product-form__option.no-js` wrapper. diff --git a/sources/academy/webscraping/scraping_basics_javascript2/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript2/12_framework.md index fe80fb5fc..ae1abb53f 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/12_framework.md @@ -534,7 +534,6 @@ If you export the dataset as JSON, it should look something like this: To scrape IMDb data, you'll need to construct a `Request` object with the appropriate search URL for each movie title. The following code snippet gives you an idea of how to do this: ```py -... from urllib.parse import quote_plus async def main(): @@ -550,7 +549,6 @@ async def main(): await context.add_requests(requests) ... -... ``` When navigating to the first search result, you might find it helpful to know that `context.enqueue_links()` accepts a `limit` keyword argument, letting you specify the max number of HTTP requests to enqueue. diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md index 6567e24ef..c5140e8d1 100644 --- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md @@ -65,7 +65,7 @@ for product in soup.select(".product-item"): print(data) ``` -Before looping over the products, we prepare an empty list. Then, instead of printing each line, we append the data of each product to the list in the form of a Python dictionary. At the end of the program, we print the entire list at once. +Before looping over the products, we prepare an empty list. Then, instead of printing each line, we append the data of each product to the list in the form of a Python dictionary. At the end of the program, we print the entire list. The program should now print the results as a single large Python list: ```text $ python main.py @@ -215,7 +215,7 @@ In this lesson, we created export files in two formats. The following challenges ### Process your JSON -Write a new Python program that reads `products.json`, finds all products with a min price greater than $500, and prints each one using [`pp()`](https://docs.python.org/3/library/pprint.html#pprint.pp). 
+Write a new Python program that reads the `products.json` file we created in the lesson, finds all products with a min price greater than $500, and prints each one using [`pp()`](https://docs.python.org/3/library/pprint.html#pprint.pp).
Solution diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md b/sources/academy/webscraping/scraping_basics_python/10_crawling.md index dc4d8cee2..84cae621e 100644 --- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md @@ -125,7 +125,7 @@ Depending on what's valuable for our use case, we can now use the same technique It looks like using a CSS selector to locate the element with the `product-meta__vendor` class, and then extracting its text, should be enough to get the vendor name as a string: ```py -vendor = product_soup.select_one(".product-meta__vendor").text.strip() +vendor = soup.select_one(".product-meta__vendor").text.strip() ``` But where do we put this line in our program? @@ -135,8 +135,6 @@ But where do we put this line in our program? In the `data` loop we're already going through all the products. Let's expand it to include downloading the product detail page, parsing it, extracting the vendor's name, and adding it as a new key in the item's dictionary: ```py -... - listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" listing_soup = download(listing_url) @@ -148,8 +146,6 @@ for product in listing_soup.select(".product-item"): # highlight-next-line item["vendor"] = product_soup.select_one(".product-meta__vendor").text.strip() data.append(item) - -... ``` If we run the program now, it'll take longer to finish since it's making 24 more HTTP requests. But in the end, it should produce exports with a new field containing the vendor's name: @@ -177,7 +173,7 @@ If we run the program now, it'll take longer to finish since it's making 24 more ## Extracting price -Scraping the vendor's name is nice, but the main reason we started checking the detail pages in the first place was to figure out how to get a price for each product. From the product listing, we could only scrape the min price, and remember—we’re building a Python app to track prices! +Scraping the vendor's name is nice, but the main reason we started checking the detail pages in the first place was to figure out how to get a price for each product. From the product listing, we could only scrape the min price, and remember—we're building a Python app to track prices! Looking at the [Sony XBR-950G BRAVIA](https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv), it's clear that the listing only shows min prices, because some products have variants, each with a different price. And different stock availability. And different SKUs… diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index 2d8b9e822..7c799759e 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -71,8 +71,6 @@ These elements aren't visible to regular visitors. They're there just in case Ja Using our knowledge of Beautiful Soup, we can locate the options and extract the data we need: ```py -... - listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" listing_soup = download(listing_url) @@ -88,8 +86,6 @@ for product in listing_soup.select(".product-item"): else: item["variant_name"] = None data.append(item) - -... 
``` The CSS selector `.product-form__option.no-js` matches elements with both `product-form__option` and `no-js` classes. Then we're using the [descendant combinator](https://developer.mozilla.org/en-US/docs/Web/CSS/Descendant_combinator) to match all `option` elements somewhere inside the `.product-form__option.no-js` wrapper. diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md index c8b5f6468..63be4cf61 100644 --- a/sources/academy/webscraping/scraping_basics_python/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md @@ -533,7 +533,6 @@ If you export the dataset as JSON, it should look something like this: To scrape IMDb data, you'll need to construct a `Request` object with the appropriate search URL for each movie title. The following code snippet gives you an idea of how to do this: ```py -... from urllib.parse import quote_plus async def main(): @@ -549,7 +548,6 @@ async def main(): await context.add_requests(requests) ... -... ``` When navigating to the first search result, you might find it helpful to know that `context.enqueue_links()` accepts a `limit` keyword argument, letting you specify the max number of HTTP requests to enqueue. From 7b3a1f7fe8792867159bd9e792f8e6863b7f16e3 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 30 Jun 2025 18:48:00 +0200 Subject: [PATCH 20/26] feat: update crawling exercises to be about JS --- .../10_crawling.md | 124 ++++++++++-------- .../scraping_basics_python/10_crawling.md | 6 +- 2 files changed, 71 insertions(+), 59 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md index 79f6e8ed0..52cc1f4ac 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md @@ -206,7 +206,7 @@ In the next lesson, we'll scrape the product detail pages so that each product v ### Scrape calling codes of African countries -This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL: +Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL: ```text https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa @@ -225,43 +225,53 @@ https://en.wikipedia.org/wiki/Cameroon +237 ... ``` -Hint: Locating cells in tables is sometimes easier if you know how to [navigate up](https://cheerio.js.org/docs/api/classes/Cheerio#parent) in the HTML element tree. +Hint: Locating cells in tables is sometimes easier if you know how to [filter](https://cheerio.js.org/docs/api/classes/Cheerio#filter) or [navigate up](https://cheerio.js.org/docs/api/classes/Cheerio#parent) in the HTML element tree.
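+
+As a rough illustration of what the hint means, here's a self-contained sketch over a made-up table. The class names are invented for the example and aren't Wikipedia's real markup; only the pattern of filtering a selection and then navigating up matters:
+
+```js
+import * as cheerio from 'cheerio';
+
+// A tiny stand-in for a country infobox
+const $ = cheerio.load(`
+  <table>
+    <tr><th class="label">Capital</th><td class="value">Luanda</td></tr>
+    <tr><th class="label">Calling code</th><td class="value">+244</td></tr>
+  </table>
+`);
+
+const callingCode = $("th.label")
+  .filter((i, element) => $(element).text().trim() === "Calling code")
+  .first()     // the header cell whose text matched
+  .parent()    // navigate up to its table row
+  .find("td.value")
+  .text()
+  .trim();
+
+console.log(callingCode); // +244
+```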
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin - - def download(url): - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - def parse_calling_code(soup): - for label in soup.select("th.infobox-label"): - if label.text.strip() == "Calling code": - data = label.parent.select_one("td.infobox-data") - return data.text.strip() - return None - - listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" - listing_soup = download(listing_url) - for name_cell in listing_soup.select(".wikitable tr td:nth-child(3)"): - link = name_cell.select_one("a") - country_url = urljoin(listing_url, link["href"]) - country_soup = download(country_url) - calling_code = parse_calling_code(country_soup) - print(country_url, calling_code) + ```js + import * as cheerio from 'cheerio'; + + async function download(url) { + const response = await fetch(url); + if (response.ok) { + const html = await response.text(); + return cheerio.load(html); + } else { + throw new Error(`HTTP ${response.status}`); + } + } + + const listingURL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"; + const $ = await download(listingURL); + + const $promises = $(".wikitable tr td:nth-child(3)").map(async (i, element) => { + const $nameCell = $(element); + const $link = $nameCell.find("a").first(); + const countryURL = new URL($link.attr("href"), listingURL).href; + + const $c = await download(countryURL); + const $label = $c("th.infobox-label") + .filter((i, element) => $c(element).text().trim() == "Calling code") + .first(); + const callingCode = $label + .parent() + .find("td.infobox-data") + .first() + .text() + .trim(); + + console.log(`${countryURL} ${callingCode || null}`); + }); + await Promise.all($promises.get()); ```
### Scrape authors of F1 news articles -This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL: +Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL: ```text https://www.theguardian.com/sport/formulaone @@ -286,34 +296,36 @@ Hints:
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin - - def download(url): - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - def parse_author(article_soup): - link = article_soup.select_one('aside a[rel="author"]') - if link: - return link.text.strip() - address = article_soup.select_one('aside address') - if address: - return address.text.strip() - return None - - listing_url = "https://www.theguardian.com/sport/formulaone" - listing_soup = download(listing_url) - for item in listing_soup.select("#maincontent ul li"): - link = item.select_one("a") - article_url = urljoin(listing_url, link["href"]) - article_soup = download(article_url) - title = article_soup.select_one("h1").text.strip() - author = parse_author(article_soup) - print(f"{author}: {title}") + ```js + import * as cheerio from 'cheerio'; + + async function download(url) { + const response = await fetch(url); + if (response.ok) { + const html = await response.text(); + return cheerio.load(html); + } else { + throw new Error(`HTTP ${response.status}`); + } + } + + const listingURL = "https://www.theguardian.com/sport/formulaone"; + const $ = await download(listingURL); + + const $promises = $("#maincontent ul li").map(async (i, element) => { + const $item = $(element); + const $link = $item.find("a").first(); + const authorURL = new URL($link.attr("href"), listingURL).href; + + const $a = await download(authorURL); + const title = $a("h1").text().trim(); + + const author = $a('a[rel="author"]').text().trim(); + const address = $a('aside address').text().trim(); + + console.log(`${author || address || null}: ${title}`); + }); + await Promise.all($promises.get()); ```
diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md b/sources/academy/webscraping/scraping_basics_python/10_crawling.md index 84cae621e..90bbf8e19 100644 --- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md @@ -187,7 +187,7 @@ In the next lesson, we'll scrape the product detail pages so that each product v ### Scrape calling codes of African countries -This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL: +Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL: ```text https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa @@ -242,7 +242,7 @@ Hint: Locating cells in tables is sometimes easier if you know how to [navigate ### Scrape authors of F1 news articles -This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL: +Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL: ```text https://www.theguardian.com/sport/formulaone @@ -278,7 +278,7 @@ Hints: return BeautifulSoup(response.text, "html.parser") def parse_author(article_soup): - link = article_soup.select_one('aside a[rel="author"]') + link = article_soup.select_one('a[rel="author"]') if link: return link.text.strip() address = article_soup.select_one('aside address') From 4566f8dff2044b01a5b2c4a6954fbcb535ffa8cb Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 1 Jul 2025 09:50:31 +0200 Subject: [PATCH 21/26] feat: explain dollar sign variable names --- .../scraping_basics_javascript2/06_locating_elements.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md index 355a2725e..25b3382f9 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md @@ -108,6 +108,12 @@ Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | There's still some room for improvement, but it's already much better! +:::info Dollar sign variable names + +In jQuery and Cheerio, the core idea is a collection that wraps selected objects, usually HTML elements. To tell these wrapped selections apart from plain arrays, strings or other objects, it's common to start variable names with a dollar sign. This is just a naming convention to improve readability. The dollar sign has no special meaning and works like any other character in a variable name. + +::: + ## Precisely locating price In the output we can see that the price isn't located precisely. 
For each product, our scraper also prints the text `Sale price`. Let's look at the HTML structure again. Each bit containing the price looks like this: From 16715fac77277efc70d3078fd9b6f8d17bd2fcd0 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 1 Jul 2025 11:23:30 +0200 Subject: [PATCH 22/26] style: better readability for a code example in crawling lesson --- .../webscraping/scraping_basics_javascript2/10_crawling.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md index 52cc1f4ac..579fe3212 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md @@ -156,10 +156,12 @@ const $ = await download(listingURL); const $promises = $(".product-item").map(async (i, element) => { const $productItem = $(element); const item = parseProduct($productItem, listingURL); + // highlight-next-line const $p = await download(item.url); // highlight-next-line item.vendor = $p(".product-meta__vendor").text().trim(); + return item; }); const data = await Promise.all($promises.get()); From ce403aceee99e90677a615c8555090e3373d10e8 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 1 Jul 2025 11:24:21 +0200 Subject: [PATCH 23/26] feat: update first half of scraping variants to be about JS --- .../11_scraping_variants.md | 106 ++++++++++++------ .../11_scraping_variants.md | 2 +- 2 files changed, 70 insertions(+), 38 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md index 414ac82f0..db8fc4049 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md @@ -41,7 +41,7 @@ Nice! We can extract the variant names, but we also need to extract the price fo ![Switching variants](images/variants-js.gif) -If we can't find a workaround, we'd need our scraper to run JavaScript. That's not impossible. Scrapers can spin up their own browser instance and automate clicking on buttons, but it's slow and resource-intensive. Ideally, we want to stick to plain HTTP requests and Beautiful Soup as much as possible. +If we can't find a workaround, we'd need our scraper to run browser JavaScript. That's not impossible. Scrapers can spin up their own browser instance and automate clicking on buttons, but it's slow and resource-intensive. Ideally, we want to stick to plain HTTP requests and Cheerio as much as possible. After a bit of detective work, we notice that not far below the `block-swatch-list` there's also a block of HTML with a class `no-js`, which contains all the data! @@ -65,41 +65,73 @@ After a bit of detective work, we notice that not far below the `block-swatch-li ``` -These elements aren't visible to regular visitors. They're there just in case JavaScript fails to work, otherwise they're hidden. This is a great find because it allows us to keep our scraper lightweight. +These elements aren't visible to regular visitors. They're there just in case browser JavaScript fails to work, otherwise they're hidden. This is a great find because it allows us to keep our scraper lightweight. 
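+
+Before we wire anything into the scraper, we can poke at this hidden markup with a throwaway sketch. It reuses the `download()` function we already have, points it at the TV product we've been inspecting (the URL comes from our earlier export), and prints the text of each hidden `option`:
+
+```js
+const productURL = "https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv";
+const $p = await download(productURL);
+
+$p(".product-form__option.no-js option").each((i, element) => {
+  console.log($p(element).text().trim());
+});
+```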
## Extracting variants -Using our knowledge of Beautiful Soup, we can locate the options and extract the data we need: +Using our knowledge of Cheerio, we can locate the `option` elements and extract the data we need. We'll loop over the options, extract variant names, and create a corresponding array of items for each product: -```py -listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -listing_soup = download(listing_url) +```js +const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" +const $ = await download(listingURL); -data = [] -for product in listing_soup.select(".product-item"): - item = parse_product(product, listing_url) - product_soup = download(item["url"]) - vendor = product_soup.select_one(".product-meta__vendor").text.strip() +const $promises = $(".product-item").map(async (i, element) => { + const $productItem = $(element); + const item = parseProduct($productItem, listingURL); - if variants := product_soup.select(".product-form__option.no-js option"): - for variant in variants: - data.append(item | {"variant_name": variant.text.strip()}) - else: - item["variant_name"] = None - data.append(item) + const $p = await download(item.url); + item.vendor = $p(".product-meta__vendor").text().trim(); + + // highlight-start + const $items = $p(".product-form__option.no-js option").map((j, element) => { + const $option = $(element); + const variantName = $option.text().trim(); + return { variantName, ...item }; + }); + // highlight-end + + return item; +}); +const data = await Promise.all($promises.get()); ``` -The CSS selector `.product-form__option.no-js` matches elements with both `product-form__option` and `no-js` classes. Then we're using the [descendant combinator](https://developer.mozilla.org/en-US/docs/Web/CSS/Descendant_combinator) to match all `option` elements somewhere inside the `.product-form__option.no-js` wrapper. +The CSS selector `.product-form__option.no-js` targets elements that have both the `product-form__option` and `no-js` classes. We then use the [descendant combinator](https://developer.mozilla.org/en-US/docs/Web/CSS/Descendant_combinator) to match all `option` elements nested within the `.product-form__option.no-js` wrapper. -Python dictionaries are mutable, so if we assigned the variant with `item["variant_name"] = ...`, we'd always overwrite the values. Instead of saving an item for each variant, we'd end up with the last variant repeated several times. To avoid this, we create a new dictionary for each variant and merge it with the `item` data before adding it to `data`. If we don't find any variants, we add the `item` as is, leaving the `variant_name` key empty. +We loop over the variants using Cheerio's `.map()` method to create a collection of item copies for each `variantName`. We now need to pass all these items onward, but the function currently returns just one item per product. And what if there are no variants? -:::tip Modern Python syntax +Let's adjust the loop so it returns a promise that resolves to an array of items instead of a single item. If a product has no variants, we'll return an array with a single item, setting `variantName` to `null`: -Since Python 3.8, you can use `:=` to simplify checking if an assignment resulted in a non-empty value. It's called an _assignment expression_ or _walrus operator_. 
You can learn more about it in the [docs](https://docs.python.org/3/reference/expressions.html#assignment-expressions) or in the [proposal document](https://peps.python.org/pep-0572/). +```js +const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" +const $ = await download(listingURL); -Since Python 3.9, you can use `|` to merge two dictionaries. If the [docs](https://docs.python.org/3/library/stdtypes.html#dict) aren't clear enough, check out the [proposal document](https://peps.python.org/pep-0584/) for more details. +const $promises = $(".product-item").map(async (i, element) => { + const $productItem = $(element); + const item = parseProduct($productItem, listingURL); -::: + const $p = await download(item.url); + item.vendor = $p(".product-meta__vendor").text().trim(); + + const $items = $p(".product-form__option.no-js option").map((j, element) => { + const $option = $(element); + const variantName = $option.text().trim(); + return { variantName, ...item }; + }); + + // highlight-start + if ($items.length > 0) { + return $items.get(); + } + return [{ variantName: null, ...item }]; + // highlight-end +}); +// highlight-start +const itemLists = await Promise.all($promises.get()); +const data = itemLists.flat(); +// highlight-end +``` + +After modifying the loop, we also updated how we collect the items into the `data` array. Since the loop now produces an array of items per product, the result of `await Promise.all()` is an array of arrays. We use [`.flat()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/flat) to merge them into a single, non-nested array. If we run the program now, we'll see 34 items in total. Some items don't have variants, so they won't have a variant name. However, they should still have a price set—our scraper should already have that info from the product listing page. @@ -108,11 +140,11 @@ If we run the program now, we'll see 34 items in total. Some items don't have va [ ... { - "variant_name": null, - "title": "Klipsch R-120SW Powerful Detailed Home Speaker - Unit", - "min_price": "324.00", - "price": "324.00", + "variant": null, "url": "https://warehouse-theme-metal.myshopify.com/products/klipsch-r-120sw-powerful-detailed-home-speaker-set-of-1", + "title": "Klipsch R-120SW Powerful Detailed Home Speaker - Unit", + "minPrice": 32400, + "price": 32400, "vendor": "Klipsch" }, ... @@ -126,19 +158,19 @@ Some products will break into several items, each with a different variant name. [ ... { - "variant_name": "Red - $178.00", + "variant": "Red - $178.00", + "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xb950-extra-bass-wireless-headphones-with-app-control", "title": "Sony XB-950B1 Extra Bass Wireless Headphones with App Control", - "min_price": "128.00", + "minPrice": 12800, "price": null, - "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xb950-extra-bass-wireless-headphones-with-app-control", "vendor": "Sony" }, { - "variant_name": "Black - $178.00", + "variant": "Black - $178.00", + "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xb950-extra-bass-wireless-headphones-with-app-control", "title": "Sony XB-950B1 Extra Bass Wireless Headphones with App Control", - "min_price": "128.00", + "minPrice": 12800, "price": null, - "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xb950-extra-bass-wireless-headphones-with-app-control", "vendor": "Sony" }, ... 
@@ -152,11 +184,11 @@ Perhaps surprisingly, some products with variants will have the price field set. [ ... { - "variant_name": "Red - $74.95", - "title": "JBL Flip 4 Waterproof Portable Bluetooth Speaker", - "min_price": "74.95", - "price": "74.95", + "variant": "Red - $74.95", "url": "https://warehouse-theme-metal.myshopify.com/products/jbl-flip-4-waterproof-portable-bluetooth-speaker", + "title": "JBL Flip 4 Waterproof Portable Bluetooth Speaker", + "minPrice": 7495, + "price": 7495, "vendor": "JBL" }, ... diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index 7c799759e..f442b8f96 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -88,7 +88,7 @@ for product in listing_soup.select(".product-item"): data.append(item) ``` -The CSS selector `.product-form__option.no-js` matches elements with both `product-form__option` and `no-js` classes. Then we're using the [descendant combinator](https://developer.mozilla.org/en-US/docs/Web/CSS/Descendant_combinator) to match all `option` elements somewhere inside the `.product-form__option.no-js` wrapper. +The CSS selector `.product-form__option.no-js` targets elements that have both the `product-form__option` and `no-js` classes. We then use the [descendant combinator](https://developer.mozilla.org/en-US/docs/Web/CSS/Descendant_combinator) to match all `option` elements nested within the `.product-form__option.no-js` wrapper. Python dictionaries are mutable, so if we assigned the variant with `item["variant_name"] = ...`, we'd always overwrite the values. Instead of saving an item for each variant, we'd end up with the last variant repeated several times. To avoid this, we create a new dictionary for each variant and merge it with the `item` data before adding it to `data`. If we don't find any variants, we add the `item` as is, leaving the `variant_name` key empty. From 3e65fb0b26dd118cf80e4ed2824bb0c2b5496979 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 2 Jul 2025 10:12:09 +0200 Subject: [PATCH 24/26] feat: update the rest of scraping variants to be about JS --- .../11_scraping_variants.md | 225 +++++++++--------- .../11_scraping_variants.md | 2 + 2 files changed, 120 insertions(+), 107 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md index db8fc4049..651a7cd6c 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md @@ -197,110 +197,121 @@ Perhaps surprisingly, some products with variants will have the price field set. ## Parsing price -The items now contain the variant as text, which is good for a start, but we want the price to be in the `price` key. Let's introduce a new function to handle that: +The items now contain the variant as text, which is good for a start, but we want the price to be in the `price` property. 
Let's introduce a new function to handle that: -```py -def parse_variant(variant): - text = variant.text.strip() - name, price_text = text.split(" - ") - price = Decimal( - price_text - .replace("$", "") - .replace(",", "") - ) - return {"variant_name": name, "price": price} +```js +function parseVariant($option) { + const [variantName, priceText] = $option + .text() + .trim() + .split(" - "); + const price = parseInt( + priceText + .replace("$", "") + .replace(".", "") + .replace(",", "") + ); + return { variantName, price }; +} ``` -First, we split the text into two parts, then we parse the price as a decimal number. This part is similar to what we already do for parsing product listing prices. The function returns a dictionary we can merge with `item`. +First, we split the text into two parts, then we parse the price as a number. This part is similar to what we already do for parsing product listing prices. The function returns an object we can merge with `item`. ## Saving price Now, if we use our new function, we should finally get a program that can scrape exact prices for all products, even if they have variants. The whole code should look like this now: -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal -import json -import csv -from urllib.parse import urljoin - -def download(url): - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - return BeautifulSoup(html_code, "html.parser") - -def parse_product(product, base_url): - title_element = product.select_one(".product-item__title") - title = title_element.text.strip() - url = urljoin(base_url, title_element["href"]) - - price_text = ( - product - .select_one(".price") - .contents[-1] - .strip() - .replace("$", "") - .replace(",", "") - ) - if price_text.startswith("From "): - min_price = Decimal(price_text.removeprefix("From ")) - price = None - else: - min_price = Decimal(price_text) - price = min_price - - return {"title": title, "min_price": min_price, "price": price, "url": url} - -def parse_variant(variant): - text = variant.text.strip() - name, price_text = text.split(" - ") - price = Decimal( - price_text - .replace("$", "") - .replace(",", "") - ) - return {"variant_name": name, "price": price} - -def export_csv(file, data): - fieldnames = list(data[0].keys()) - writer = csv.DictWriter(file, fieldnames=fieldnames) - writer.writeheader() - for row in data: - writer.writerow(row) - -def export_json(file, data): - def serialize(obj): - if isinstance(obj, Decimal): - return str(obj) - raise TypeError("Object not JSON serializable") - - json.dump(data, file, default=serialize, indent=2) - -listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -listing_soup = download(listing_url) - -data = [] -for product in listing_soup.select(".product-item"): - item = parse_product(product, listing_url) - product_soup = download(item["url"]) - vendor = product_soup.select_one(".product-meta__vendor").text.strip() - - if variants := product_soup.select(".product-form__option.no-js option"): - for variant in variants: - # highlight-next-line - data.append(item | parse_variant(variant)) - else: - item["variant_name"] = None - data.append(item) - -with open("products.csv", "w") as file: - export_csv(file, data) - -with open("products.json", "w") as file: - export_json(file, data) +```js +import * as cheerio from 'cheerio'; +import { writeFile } from 'fs/promises'; +import { AsyncParser } from '@json2csv/node'; + +async function download(url) { + const 
response = await fetch(url); + if (response.ok) { + const html = await response.text(); + return cheerio.load(html); + } else { + throw new Error(`HTTP ${response.status}`); + } +} + +function parseProduct(productItem, baseURL) { + const title = productItem.find(".product-item__title"); + const titleText = title.text().trim(); + const url = new URL(title.attr("href"), baseURL).href; + + const price = productItem.find(".price").contents().last(); + const priceRange = { minPrice: null, price: null }; + const priceText = price + .text() + .trim() + .replace("$", "") + .replace(".", "") + .replace(",", ""); + + if (priceText.startsWith("From ")) { + priceRange.minPrice = parseInt(priceText.replace("From ", "")); + } else { + priceRange.minPrice = parseInt(priceText); + priceRange.price = priceRange.minPrice; + } + + return { url, title: titleText, ...priceRange }; +} + +async function exportJSON(data) { + return JSON.stringify(data, null, 2); +} + +async function exportCSV(data) { + const parser = new AsyncParser(); + return await parser.parse(data).promise(); +} + +// highlight-start +function parseVariant($option) { + const [variantName, priceText] = $option + .text() + .trim() + .split(" - "); + const price = parseInt( + priceText + .replace("$", "") + .replace(".", "") + .replace(",", "") + ); + return { variantName, price }; +} +// highlight-end + +const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" +const $ = await download(listingURL); + +const $promises = $(".product-item").map(async (i, element) => { + const $productItem = $(element); + const item = parseProduct($productItem, listingURL); + + const $p = await download(item.url); + item.vendor = $p(".product-meta__vendor").text().trim(); + + const $items = $p(".product-form__option.no-js option").map((j, element) => { + // highlight-next-line + const variant = parseVariant($(element)); + // highlight-next-line + return { ...item, ...variant }; + }); + + if ($items.length > 0) { + return $items.get(); + } + return [{ variantName: null, ...item }]; +}); +const itemLists = await Promise.all($promises.get()); +const data = itemLists.flat(); + +await writeFile('products.json', await exportJSON(data)); +await writeFile('products.csv', await exportCSV(data)); ``` Let's run the scraper and see if all the items in the data contain prices: @@ -310,26 +321,26 @@ Let's run the scraper and see if all the items in the data contain prices: [ ... { - "variant_name": "Red", - "title": "Sony XB-950B1 Extra Bass Wireless Headphones with App Control", - "min_price": "128.00", - "price": "178.00", "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xb950-extra-bass-wireless-headphones-with-app-control", - "vendor": "Sony" + "title": "Sony XB-950B1 Extra Bass Wireless Headphones with App Control", + "minPrice": 12800, + "price": 17800, + "vendor": "Sony", + "variantName": "Red" }, { - "variant_name": "Black", - "title": "Sony XB-950B1 Extra Bass Wireless Headphones with App Control", - "min_price": "128.00", - "price": "178.00", "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xb950-extra-bass-wireless-headphones-with-app-control", - "vendor": "Sony" + "title": "Sony XB-950B1 Extra Bass Wireless Headphones with App Control", + "minPrice": 12800, + "price": 17800, + "vendor": "Sony", + "variantName": "Black" }, ... ] ``` -Success! We managed to build a Python application for watching prices! +Success! We managed to build a Node.js application for watching prices! Is this the end? Maybe! 
In the next lesson, we'll use a scraping framework to build the same application, but with less code, faster requests, and better visibility into what's happening while we wait for the program to finish. diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index f442b8f96..b484e8751 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -221,6 +221,7 @@ def parse_product(product, base_url): return {"title": title, "min_price": min_price, "price": price, "url": url} +# highlight-start def parse_variant(variant): text = variant.text.strip() name, price_text = text.split(" - ") @@ -230,6 +231,7 @@ def parse_variant(variant): .replace(",", "") ) return {"variant_name": name, "price": price} +# highlight-end def export_json(file, data): def serialize(obj): From 20278bdd37e39322c9995762b3da770a11407577 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 2 Jul 2025 10:26:27 +0200 Subject: [PATCH 25/26] feat: update one scraping variants exercise to be about JS --- .../11_scraping_variants.md | 60 ++++++++++--------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md index 651a7cd6c..073e8a8c3 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md @@ -418,8 +418,8 @@ You can find everything you need for working with dates and times in Python's [` Scrape the [CNN Sports](https://edition.cnn.com/sport) homepage. For each linked article, calculate its length in characters: - Locate the element that holds the main content of the article. -- Use [`get_text()`](https://beautiful-soup-4.readthedocs.io/en/latest/index.html#get-text) to extract all the content as plain text. -- Use `len()` to calculate the character count. +- Use `.text()` to extract all the content as plain text. +- Use `.length` to calculate the character count. Skip pages without text (like those that only have a video). Sort the results and print the URL of the shortest article that made it to the homepage. @@ -428,32 +428,38 @@ At the time of writing, the shortest article on the CNN Sports homepage is [abou
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin + ```js + import * as cheerio from 'cheerio'; + + async function download(url) { + const response = await fetch(url); + if (response.ok) { + const html = await response.text(); + return cheerio.load(html); + } else { + throw new Error(`HTTP ${response.status}`); + } + } + + const listingURL = "https://edition.cnn.com/sport"; + const $ = await download(listingURL); + + const $promises = $(".layout__main .card").map(async (i, element) => { + const $link = $(element).find("a").first(); + const articleURL = new URL($link.attr("href"), listingURL).href; + + const $a = await download(articleURL); + const content = $a(".article__content").text().trim(); + + return { url: articleURL, length: content.length }; + }); + + const data = await Promise.all($promises.get()); + const nonZeroData = data.filter(({ url, length }) => length > 0); + nonZeroData.sort((a, b) => a.length - b.length); + const shortestItem = nonZeroData[0]; - def download(url): - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - listing_url = "https://edition.cnn.com/sport" - listing_soup = download(listing_url) - - data = [] - for card in listing_soup.select(".layout__main .card"): - link = card.select_one(".container__link") - article_url = urljoin(listing_url, link["href"]) - article_soup = download(article_url) - if content := article_soup.select_one(".article__content"): - length = len(content.get_text()) - data.append((length, article_url)) - - data.sort() - shortest_item = data[0] - item_url = shortest_item[1] - print(item_url) + console.log(shortestItem.url); ```
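The part of this solution that's easiest to get wrong is the combination of `.map()`, `.get()`, and `Promise.all()`, the same pattern our main scraper uses. Cheerio's `.map()` called with an `async` callback doesn't return the scraped items themselves, but an array of pending promises, so we convert the collection to a plain array with `.get()` and wait for all of them with `Promise.all()`. Below is a minimal sketch of just that pattern, separate from the exercise; the HTML snippet and URLs in it are made up for illustration:

```js
import * as cheerio from 'cheerio';

// Made-up HTML standing in for a downloaded listing page
const $ = cheerio.load(`
  <a href="https://example.com/a">Article A</a>
  <a href="https://example.com/b">Article B</a>
`);

// .map() with an async callback produces one promise per matched element
const promises = $("a").map(async (i, element) => {
  const url = $(element).attr("href");
  const response = await fetch(url); // all links are fetched concurrently
  const html = await response.text();
  return { url, length: html.length };
}).get(); // .get() turns the Cheerio collection into a plain array of promises

// Promise.all() waits until every download has finished
const data = await Promise.all(promises);
console.log(data);
```

If we forgot `Promise.all()`, we'd end up printing an array of pending promise objects instead of scraped data, which is a common source of confusion when mixing Cheerio with `async` callbacks.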
From 36ad10bfd439b99d0d16b351c30ea9cdd4f5679f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 2 Jul 2025 15:48:59 +0200 Subject: [PATCH 26/26] feat: update another scraping variants exercise to be about JS --- .../11_scraping_variants.md | 137 +++++++++++------- .../11_scraping_variants.md | 2 +- 2 files changed, 89 insertions(+), 50 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md index 073e8a8c3..a1bc7f724 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md @@ -348,69 +348,108 @@ Is this the end? Maybe! In the next lesson, we'll use a scraping framework to bu -### Build a scraper for watching Python jobs +### Build a scraper for watching npm packages -You're able to build a scraper now, aren't you? Let's build another one! Python's official website has a [job board](https://www.python.org/jobs/). Scrape the job postings that match the following criteria: +You can build a scraper now, can't you? Let's build another one! From the registry at [npmjs.com](https://www.npmjs.com/), scrape information about npm packages that match the following criteria: -- Tagged as "Database" -- Posted within the last 60 days +- Have the keyword "llm" (as in _large language model_) +- Updated within the last two years ("2 years ago" is okay; "3 years ago" is too old) -For each job posting found, use [`pp()`](https://docs.python.org/3/library/pprint.html#pprint.pp) to print a dictionary containing the following data: +Print an array of the top 5 packages with the most dependents. Each package should be represented by an object containing the following data: -- Job title -- Company -- URL to the job posting -- Date of posting +- Name +- Description +- URL to the package detail page +- Number of dependents +- Number of downloads Your output should look something like this: -```py -{'title': 'Senior Full Stack Developer', - 'company': 'Baserow', - 'url': 'https://www.python.org/jobs/7705/', - 'posted_on': datetime.date(2024, 9, 16)} -{'title': 'Senior Python Engineer', - 'company': 'Active Prime', - 'url': 'https://www.python.org/jobs/7699/', - 'posted_on': datetime.date(2024, 9, 5)} -... +```js +[ + { + name: 'langchain', + url: 'https://www.npmjs.com/package/langchain', + description: 'Typescript bindings for langchain', + dependents: 735, + downloads: 3938 + }, + { + name: '@langchain/core', + url: 'https://www.npmjs.com/package/@langchain/core', + description: 'Core LangChain.js abstractions and schemas', + dependents: 730, + downloads: 5994 + }, + ... +] ``` -You can find everything you need for working with dates and times in Python's [`datetime`](https://docs.python.org/3/library/datetime.html) module, including `date.today()`, `datetime.fromisoformat()`, `datetime.date()`, and `timedelta()`. -
Solution - After inspecting the job board, you'll notice that job postings tagged as "Database" have a dedicated URL. We'll use that as our starting point, which saves us from having to scrape and check the tags manually. - - ```py - from pprint import pp - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin - from datetime import datetime, date, timedelta - - today = date.today() - jobs_url = "https://www.python.org/jobs/type/database/" - response = httpx.get(jobs_url) - response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - - for job in soup.select(".list-recent-jobs li"): - link = job.select_one(".listing-company-name a") - - time = job.select_one(".listing-posted time") - posted_at = datetime.fromisoformat(time["datetime"]) - posted_on = posted_at.date() - posted_ago = today - posted_on - - if posted_ago <= timedelta(days=60): - title = link.text.strip() - company = list(job.select_one(".listing-company-name").stripped_strings)[-1] - url = urljoin(jobs_url, link["href"]) - pp({"title": title, "company": company, "url": url, "posted_on": posted_on}) + After inspecting the registry, you'll notice that packages with the keyword "llm" have a dedicated URL. Also, changing the sorting dropdown results in a page with its own URL. We'll use that as our starting point, which saves us from having to scrape the whole registry and then filter by keyword or sort by the number of dependents. + + ```js + import * as cheerio from 'cheerio'; + + async function download(url) { + const response = await fetch(url); + if (response.ok) { + const html = await response.text(); + return cheerio.load(html); + } else { + throw new Error(`HTTP ${response.status}`); + } + } + + const listingURL = "https://www.npmjs.com/search?page=0&q=keywords%3Allm&sortBy=dependent_count"; + const $ = await download(listingURL); + + const $promises = $("section").map(async (i, element) => { + const $card = $(element); + + const details = $card + .children() + .first() + .children() + .last() + .text() + .split("•"); + const updatedText = details[2].trim(); + const dependents = parseInt(details[3].replace("dependents", "").trim()); + + if (updatedText.includes("years ago")) { + const yearsAgo = parseInt(updatedText.replace("years ago", "").trim()); + if (yearsAgo > 2) { + return null; + } + } + + const $link = $card.find("a").first(); + const name = $link.text().trim(); + const url = new URL($link.attr("href"), listingURL).href; + const description = $card.find("p").text().trim(); + + const downloadsText = $card + .children() + .last() + .text() + .replace(",", "") + .trim(); + const downloads = parseInt(downloadsText); + + return { name, url, description, dependents, downloads }; + }); + + const data = await Promise.all($promises.get()); + console.log(data.filter(item => item !== null).splice(0, 5)); ``` + Since the HTML doesn't contain any descriptive classes, we must rely on its structure. We're using [`.children()`](https://cheerio.js.org/docs/api/classes/Cheerio#children) to carefully navigate the HTML element tree. + + For items older than 2 years, we return `null` instead of an item. Before printing the results, we use [.filter()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/filter) to remove these empty values and [.splice()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/splice) the array down to just 5 items. +
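One small caveat about the solution above: `.replace(",", "")` with a string argument removes only the first comma. For counts like those in the sample output that's enough, but if a downloads figure ever arrives formatted with more than one comma (for example `1,234,567`), `parseInt()` stops at the first character it can't parse and reports a much smaller number. A regular expression with the global flag avoids that. The helper below is only an illustrative sketch with a made-up name, not part of the original solution:

```js
// Hypothetical helper: parse a formatted count like "1,234,567" into a number.
// The /g flag removes every comma, not just the first one.
function parseCount(text) {
  return parseInt(text.trim().replace(/,/g, ""), 10);
}

console.log(parseCount("3,938"));     // 3938
console.log(parseCount("1,234,567")); // 1234567
```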
### Find the shortest CNN article which made it to the Sports homepage diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index b484e8751..98f04a761 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -308,7 +308,7 @@ Is this the end? Maybe! In the next lesson, we'll use a scraping framework to bu ### Build a scraper for watching Python jobs -You're able to build a scraper now, aren't you? Let's build another one! Python's official website has a [job board](https://www.python.org/jobs/). Scrape the job postings that match the following criteria: +You can build a scraper now, can't you? Let's build another one! Python's official website has a [job board](https://www.python.org/jobs/). Scrape the job postings that match the following criteria: - Tagged as "Database" - Posted within the last 60 days