diff --git a/scrape_amazon/util/scrape.py b/scrape_amazon/util/scrape.py
index 881d4a1..3414619 100644
--- a/scrape_amazon/util/scrape.py
+++ b/scrape_amazon/util/scrape.py
@@ -42,13 +42,16 @@ def extractPage(url: str) -> str:
     reviewDate = productPage.findAll("span", {"class": "review-date"})
     ratingsSpan = productPage.findAll("i", {"class": "review-rating"})
     reviewTitlesSpan = productPage.findAll("a", {"class": "review-title-content"})
+    reviewTitlesSpan.extend(productPage.findAll("span", {"data-hook": "review-title", "class": "a-size-base review-title a-color-base review-title-content a-text-bold"}))  # reviews from other countries
     reviewDescriptionSpan = productPage.findAll(
         "span", {"class": "review-text-content"}
     )
     # Loop is initiated from 2 because we have to exclude the Top Positive and
     # Top Critical Review which otherwise will get repeated.
-    for i in range(2, len(reviewrsSpan)):
+    # Count how many Top Positive/Top Critical Review headers exist and skip them.
+    topReviewrsSpan = productPage.findAll("h4", {"class": "a-size-medium view-point-title"})
+    for i in range(len(topReviewrsSpan), len(reviewrsSpan)):
         reviewers.append(reviewrsSpan[i].get_text())
         ratings.append(int(ratingsSpan[i].get_text()[0]))
         matches = datefinder.find_dates(reviewDate[i].get_text())
 
@@ -75,10 +78,11 @@ def extractPage(url: str) -> str:
 def extractTotalPages(url):
     r = get_URL(url)
     productPage = BeautifulSoup(r.text, "html.parser")
+    # Parse the review count instead of the rating count
     pageSpanText = productPage.findAll(
-        "span", {"class": "a-size-base a-color-secondary"}
+        "div", {"data-hook": "cr-filter-info-review-rating-count", "class": "a-row a-spacing-base a-size-base"}
     )[0].get_text()
-    totalReviews = int(re.findall(r"\d+", pageSpanText.replace(",", ""))[0])
+    totalReviews = int(re.findall(r"\d+", pageSpanText.strip().split("ratings")[1].replace(",", ""))[0])
     return (
         math.ceil(totalReviews / 10),
         productPage.find("title").get_text(),
@@ -92,7 +96,8 @@ def scrape_reviews(url):
     print(f"[scrape-amazon] Total Pages - {totalPages}")
     print(f"[scrape-amazon] Total Reviews - {totalReviews}\n")
     urlsToFetch = []
-    for page in range(1, totalPages + 1):
+    # Amazon only displays reviews for the first 500 pages
+    for page in range(1, min(totalPages, 500) + 1):
         urlToFetch = url + f"?pageNumber={page}"
         urlsToFetch.append(urlToFetch)
 
diff --git a/scrape_amazon/util/urlFunctions.py b/scrape_amazon/util/urlFunctions.py
index fb12cf7..353b5d8 100644
--- a/scrape_amazon/util/urlFunctions.py
+++ b/scrape_amazon/util/urlFunctions.py
@@ -1,4 +1,5 @@
 import requests
+import time
 from my_fake_useragent import UserAgent
 
 url_prefix = "https://www.amazon"
@@ -27,10 +28,15 @@ def get_URL(url: str) -> str:
     """
     user_agent = ua.random()
     while True:
-        content: str = requests.get(url, headers={"User-Agent": user_agent})
-        if "api-services-support@amazon.com" in content.text:
-            user_agent = ua.random()
+        # Retry on ConnectionError instead of crashing
+        try:
+            content: str = requests.get(url, headers={"User-Agent": user_agent})
+            if "api-services-support@amazon.com" in content.text:
+                user_agent = ua.random()
+                continue
+            break
+        except requests.exceptions.ConnectionError:
+            time.sleep(1)
             continue
-        break
-
+
     return content