From a32ebca9c01ecfd09f17dcfbbe30be535ddb1d64 Mon Sep 17 00:00:00 2001 From: Ronald Petty Date: Sat, 16 May 2020 19:34:36 -0700 Subject: [PATCH] Update nlp.py * changing article-body to main-post-radar-content div class * ycombinator changed list to 102 companies from 101 --- scratch/nlp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scratch/nlp.py b/scratch/nlp.py index f31e92f2..6d0dafbd 100644 --- a/scratch/nlp.py +++ b/scratch/nlp.py @@ -24,7 +24,7 @@ def fix_unicode(text: str) -> str: html = requests.get(url).text soup = BeautifulSoup(html, 'html5lib') -content = soup.find("div", "article-body") # find article-body div +content = soup.find("div", "main-post-radar-content") # find main-post-radar-content div regex = r"[\w']+|[\.]" # matches a word or a period document = [] @@ -643,7 +643,7 @@ def text_size(total: int) -> float: companies = list({b.text for b in soup("b") if "h4" in b.get("class", ())}) - assert len(companies) == 101 + assert len(companies) == 102 vocab = Vocabulary([c for company in companies for c in company]) @@ -721,4 +721,4 @@ def generate(seed: str = START, max_len: int = 50) -> str: if epoch == 200: optimizer.lr *= 0.1 -if __name__ == "__main__": main() \ No newline at end of file +if __name__ == "__main__": main()