from bs4 import BeautifulSoup
import re
import os
def sitemap_inspection(report_folder):
    """Fetch every link from the sitemap links file and harvest e-mail addresses.

    Reads URLs from ``<report_folder>//03-sitemap_links.txt``, requests each
    one, and regex-extracts e-mail addresses from the page text. Prints a
    colored progress/summary report to stdout as it goes.

    Parameters:
        report_folder: path to the report folder produced by earlier stages;
            must contain ``03-sitemap_links.txt`` for the scan to run.

    Returns:
        A de-duplicated list of comma-joined e-mail strings (one entry per
        page that yielded at least one address), or ``None`` when the sitemap
        links file is missing.

    NOTE(review): ``requests``, ``Fore`` and ``Style`` are assumed to be
    imported above this chunk — confirm against the full file.
    """
    sitemap_links_file = report_folder + '//03-sitemap_links.txt'
    # Guard clause: nothing to inspect without the sitemap links file.
    if not os.path.exists(sitemap_links_file):
        print(Fore.RED + "Cannot start PageSearch in Deep Mode because sitemap_links.txt file doesn't exist" + Style.RESET_ALL)
        return
    try:
        accessed_links_counter = 0
        print(Fore.GREEN + "Trying to access sitemap_links.txt file..." + Style.RESET_ALL)
        with open(sitemap_links_file, "r") as file:
            links = file.readlines()
        print(Fore.GREEN + "Reading file and forming links list..." + Style.RESET_ALL)
        ps_docs_path = report_folder + '//sitemap_inspection'
        if not os.path.exists(ps_docs_path):
            os.makedirs(ps_docs_path)
        total_emails = []
        # Compile once instead of re-matching the raw pattern on every page.
        email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
        links = [link.strip() for link in links]
        total_links_counter = len(links)
        print(Fore.GREEN + "Gathering e-mails..." + Style.RESET_ALL)
        for url in links:
            # A single dead or slow link must not hang or abort the scan:
            # bound the request and skip URLs that error out.
            try:
                response = requests.get(url, timeout=10)
            except requests.exceptions.RequestException:
                continue
            if response.status_code == 200:
                accessed_links_counter += 1
                soup = BeautifulSoup(response.content, 'html.parser')
                emails = email_pattern.findall(soup.text)
                total_emails.append(emails)
        # Drop pages with no hits, join each page's addresses into one
        # comma-separated string, then de-duplicate those groups.
        ds_emails_list = [x for x in total_emails if x]
        ds_emails_cleaned = [', '.join(sublist) for sublist in ds_emails_list]
        ds_emails_return = list(set(ds_emails_cleaned))
        print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")
        print(Fore.GREEN + f"\n During PageSearch Sitemap Inspection process:\n [+] Total {total_links_counter} links were checked")
        print(Fore.GREEN + f"[+] Among them, {accessed_links_counter} links were accessible")
        print(Fore.GREEN + f"[+] In result, {len(ds_emails_return)} unique e-mail addresses were found")
        return ds_emails_return
    except FileNotFoundError:
        # The file can still vanish between the existence check and open()
        # (race with another process), so keep this guard.
        print(Fore.RED + "Cannot start PageSearch in Deep Mode because sitemap_links.txt file doesn't exist" + Style.RESET_ALL)