Skip to content

Commit 089a93a

Browse files
Added new function to inspect sitemap links
1 parent 3fc45ca commit 089a93a

File tree

1 file changed

+36
-0
lines changed

1 file changed

+36
-0
lines changed

pagesearch/pagesearch_deepsearch.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import requests
2+
from colorama import Fore, Style
3+
from bs4 import BeautifulSoup
4+
import re
5+
import os
6+
def sitemap_inspection(report_folder, *, timeout=10):
    """Scan every URL from the sitemap links file for e-mail addresses.

    Reads ``03-sitemap_links.txt`` from *report_folder*, makes sure the
    ``sitemap_inspection`` sub-folder exists, requests every listed URL and
    collects the e-mail addresses found in the text of each page that answers
    with HTTP 200. A short summary is printed to the console.

    Args:
        report_folder: Path of the folder that holds ``03-sitemap_links.txt``.
        timeout: Seconds to wait for each HTTP request before giving up on
            that link.

    Returns:
        A list of the unique e-mail addresses found, in first-seen order, or
        ``None`` when the links file does not exist.
    """
    links_path = os.path.join(report_folder, '03-sitemap_links.txt')
    print(Fore.GREEN + "Trying to access sitemap_links.txt file..." + Style.RESET_ALL)
    # Keep the try body minimal: only the file read can legitimately raise
    # FileNotFoundError, and the file should not stay open during the scan.
    try:
        with open(links_path, "r") as file:
            raw_links = file.readlines()
    except FileNotFoundError as e:
        print(Fore.RED + f"Cannot start PageSearch in Deep Mode because sitemap_links.txt file doesn't exist {e}" + Style.RESET_ALL)
        return None

    print(Fore.GREEN + "Reading file and forming links list..." + Style.RESET_ALL)
    ps_docs_path = os.path.join(report_folder, 'sitemap_inspection')
    os.makedirs(ps_docs_path, exist_ok=True)

    # Blank lines would otherwise be requested as empty URLs and raise.
    links = [link.strip() for link in raw_links if link.strip()]
    total_links_counter = len(links)
    accessed_links_counter = 0
    email_regex = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    found_emails = []

    for url in links:
        # A single unreachable or malformed link must not abort the whole scan.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(Fore.RED + f"Cannot access {url}: {e}" + Style.RESET_ALL)
            continue
        if response.status_code != 200:
            continue
        accessed_links_counter += 1
        soup = BeautifulSoup(response.content, 'html.parser')
        found_emails.extend(email_regex.findall(soup.text))

    # Deduplicate individual addresses (not per-page groups) so the reported
    # count really is the number of unique addresses; dict keeps first-seen order.
    ds_emails_return = list(dict.fromkeys(found_emails))

    print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
    print(Fore.GREEN + f"During PageSearch Sitemap Inspection process:\n[+] Total {total_links_counter} links were checked" + Style.RESET_ALL)
    print(Fore.GREEN + f"[+] Among them, {accessed_links_counter} links were accessible" + Style.RESET_ALL)
    print(Fore.GREEN + f"[+] In result, {len(ds_emails_return)} unique e-mail addresses were found" + Style.RESET_ALL)
    return ds_emails_return

0 commit comments

Comments
 (0)