diff --git a/scratch/nlp.py b/scratch/nlp.py index f31e92f2..6d0dafbd 100644 --- a/scratch/nlp.py +++ b/scratch/nlp.py @@ -24,7 +24,7 @@ def fix_unicode(text: str) -> str: html = requests.get(url).text soup = BeautifulSoup(html, 'html5lib') -content = soup.find("div", "article-body") # find article-body div +content = soup.find("div", "main-post-radar-content") # find main-post-radar-content div regex = r"[\w']+|[\.]" # matches a word or a period document = [] @@ -643,7 +643,7 @@ def text_size(total: int) -> float: companies = list({b.text for b in soup("b") if "h4" in b.get("class", ())}) - assert len(companies) == 101 + assert len(companies) == 102 vocab = Vocabulary([c for company in companies for c in company]) @@ -721,4 +721,4 @@ def generate(seed: str = START, max_len: int = 50) -> str: if epoch == 200: optimizer.lr *= 0.1 -if __name__ == "__main__": main() \ No newline at end of file +if __name__ == "__main__": main()