Skip to content

Commit 74227a4

Browse files
Minor fixes for PageSearch SI mode
1 parent 12e4060 commit 74227a4

File tree

1 file changed

+35
-30
lines changed

1 file changed

+35
-30
lines changed

pagesearch/pagesearch_deepsearch.py

Lines changed: 35 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,34 +3,39 @@
33
from bs4 import BeautifulSoup
44
import re
55
import os
6+
67
def sitemap_inspection(report_folder):
    """Fetch every URL from the sitemap links file and harvest e-mail addresses.

    Reads ``report_folder//03-sitemap_links.txt`` (one URL per line), requests
    each link, extracts e-mail addresses from the page text, prints a summary,
    and ensures the ``report_folder//sitemap_inspection`` directory exists
    (side effect kept for downstream report artifacts).

    :param report_folder: path to the report folder that holds 03-sitemap_links.txt
    :return: sorted list of unique e-mail address strings, or None when the
             links file does not exist
    """
    sitemap_links_path = report_folder + '//03-sitemap_links.txt'
    # Guard clause replaces the duplicated if/else + except branches that
    # printed the same message twice in the original.
    if not os.path.exists(sitemap_links_path):
        print(Fore.RED + "Cannot start PageSearch in Deep Mode because sitemap_links.txt file doesn't exist" + Style.RESET_ALL)
        return
    try:
        accessed_links_counter = 0
        print(Fore.GREEN + "Trying to access sitemap_links.txt file..." + Style.RESET_ALL)
        with open(sitemap_links_path, "r") as file:
            links = file.readlines()
        print(Fore.GREEN + "Reading file and forming links list..." + Style.RESET_ALL)
        ps_docs_path = report_folder + '//sitemap_inspection'
        # exist_ok collapses the original exists()/makedirs() pair.
        os.makedirs(ps_docs_path, exist_ok=True)
        # Compile once: the pattern is reused for every fetched page.
        email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
        links = [link.strip() for link in links]
        total_links_counter = len(links)
        total_emails = []
        print(Fore.GREEN + "Gathering e-mails..." + Style.RESET_ALL)
        for url in links:
            try:
                # Timeout so one dead host cannot hang the whole inspection;
                # the original had neither a timeout nor error handling here.
                response = requests.get(url, timeout=10)
            except requests.RequestException:
                continue  # unreachable link: skip it, keep scanning the rest
            if response.status_code == 200:
                accessed_links_counter += 1
                soup = BeautifulSoup(response.content, 'html.parser')
                # BUGFIX: extend with individual addresses. The original
                # append()ed per-page lists and then deduplicated comma-joined
                # strings, so the "unique e-mail addresses" count was wrong.
                total_emails.extend(email_pattern.findall(soup.text))
        ds_emails_return = sorted(set(total_emails))
        print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")
        print(Fore.GREEN + f"\nDuring PageSearch Sitemap Inspection process:\n[+] Total {total_links_counter} links were checked")
        print(Fore.GREEN + f"[+] Among them, {accessed_links_counter} links were accessible")
        print(Fore.GREEN + f"[+] In result, {len(ds_emails_return)} unique e-mail addresses were found")
        return ds_emails_return
    except FileNotFoundError:
        # Safety net for a TOCTOU race: file removed between the exists()
        # check and open(). (Dropped the useless f-prefix from the original.)
        print(Fore.RED + "Cannot start PageSearch in Deep Mode because sitemap_links.txt file doesn't exist" + Style.RESET_ALL)

0 commit comments

Comments
 (0)