From 48dc6c98810c24d90d90e6d70f4e819b4048d6d1 Mon Sep 17 00:00:00 2001 From: OtmanDaoudi Date: Thu, 9 Nov 2023 22:03:30 +0100 Subject: [PATCH] fix bad url and div id --- scratch/nlp.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scratch/nlp.py b/scratch/nlp.py index f31e92f2..8f8c155d 100644 --- a/scratch/nlp.py +++ b/scratch/nlp.py @@ -20,13 +20,12 @@ def fix_unicode(text: str) -> str: from bs4 import BeautifulSoup import requests -url = "https://www.oreilly.com/ideas/what-is-data-science" +url = "http://radar.oreilly.com/2010/06/what-is-data-science.html" html = requests.get(url).text soup = BeautifulSoup(html, 'html5lib') -content = soup.find("div", "article-body") # find article-body div -regex = r"[\w']+|[\.]" # matches a word or a period - +content = soup.find("div", id="body-content") # find the div with id "body-content" +regex = r"[\w']+|[\.]" # matches a word or a period document = [] for paragraph in content("p"):