From 48dc6c98810c24d90d90e6d70f4e819b4048d6d1 Mon Sep 17 00:00:00 2001
From: OtmanDaoudi <daoudiotman22@gmail.com>
Date: Thu, 9 Nov 2023 22:03:30 +0100
Subject: [PATCH] Fix bad URL and div id

---
 scratch/nlp.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/scratch/nlp.py b/scratch/nlp.py
index f31e92f2..8f8c155d 100644
--- a/scratch/nlp.py
+++ b/scratch/nlp.py
@@ -20,13 +20,12 @@ def fix_unicode(text: str) -> str:
 from bs4 import BeautifulSoup
 import requests
 
-url = "https://www.oreilly.com/ideas/what-is-data-science"
+url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"
 html = requests.get(url).text
 soup = BeautifulSoup(html, 'html5lib')
 
-content = soup.find("div", "article-body")   # find article-body div
-regex = r"[\w']+|[\.]"                       # matches a word or a period
-
+content = soup.find("div", id="body-content")  # find the div whose id is "body-content"
+regex = r"[\w']+|[\.]"  # matches a word or a period
 document = []
 
 for paragraph in content("p"):