1
+ import requests
2
+ from colorama import Fore , Style
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+ import os
6
def sitemap_inspection(report_folder):
    """Visit every URL listed in 03-sitemap_links.txt and harvest e-mail addresses.

    Reads the sitemap links file produced by an earlier stage, fetches each
    link, scrapes the visible page text for e-mail addresses, and prints a
    short summary of how many links were reachable and how many unique
    addresses were found.

    Args:
        report_folder: Path to the report directory containing
            ``03-sitemap_links.txt``; a ``sitemap_inspection`` subfolder is
            created inside it if missing.

    Returns:
        A sorted list of unique e-mail address strings. Returns an empty
        list when the sitemap links file does not exist.
    """
    try:
        print(Fore.GREEN + "Trying to access sitemap_links.txt file..." + Style.RESET_ALL)
        with open(os.path.join(report_folder, '03-sitemap_links.txt'), "r") as file:
            links = file.readlines()
        print(Fore.GREEN + "Reading file and forming links list..." + Style.RESET_ALL)

        # Ensure the output subfolder exists (side effect kept from original).
        ps_docs_path = os.path.join(report_folder, 'sitemap_inspection')
        if not os.path.exists(ps_docs_path):
            os.makedirs(ps_docs_path)

        # Compile once — the pattern is reused for every fetched page.
        email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
        links = [link.strip() for link in links]
        total_links_counter = len(links)
        accessed_links_counter = 0
        total_emails = []

        for url in links:
            try:
                # Timeout so a single dead host cannot hang the whole scan.
                response = requests.get(url, timeout=10)
            except requests.exceptions.RequestException:
                # Unreachable/malformed link: skip it instead of crashing the loop.
                continue
            if response.status_code == 200:
                accessed_links_counter += 1
                soup = BeautifulSoup(response.content, 'html.parser')
                # Flatten addresses directly; the original kept per-page
                # sublists and deduplicated comma-joined groups, which
                # miscounted "unique" addresses.
                total_emails.extend(email_pattern.findall(soup.text))

        ds_emails_return = sorted(set(total_emails))

        print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")
        print(Fore.GREEN + f"During PageSearch Sitemap Inspection process:\n [+] Total {total_links_counter} links were checked")
        print(Fore.GREEN + f"[+] Among them, {accessed_links_counter} links were accessible")
        print(Fore.GREEN + f"[+] In result, {len(ds_emails_return)} unique e-mail addresses were found")
        return ds_emails_return
    except FileNotFoundError as e:
        print(Fore.RED + f"Cannot start PageSearch in Deep Mode because sitemap_links.txt file doesn't exist {e}" + Style.RESET_ALL)
        # Return an empty list (not implicit None) so callers always get a list.
        return []
0 commit comments