Skip to content

Commit 290206d

Browse files
Added e-mails return from PageSearch (#59)
1 parent f764feb commit 290206d

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

pagesearch/pagesearch_parsers.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,12 +55,14 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
5555
if not os.path.exists(ps_docs_path):
5656
os.makedirs(ps_docs_path)
5757
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
58+
total_emails = []
5859
for url in subdomains_list:
5960
try:
6061
response = requests.get('http://' + url)
6162
soup = BeautifulSoup(response.content, 'html.parser')
6263
title = soup.title.string
6364
emails = re.findall(email_pattern, soup.text)
65+
total_emails.append(emails)
6466
if not emails:
6567
emails = ['None']
6668
print(Fore.GREEN + "Page URL: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{url}" + Style.RESET_ALL)
@@ -70,7 +72,6 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
7072
for link in links:
7173
href = link.get('href')
7274
if href:
73-
#print(f"Found link: {href}") # Debugging line
7475
if href.lower().endswith(('.docx', '.xlsx', '.csv', '.pdf', '.pptx', '.doc', '.ppt', '.xls', '.rtf')):
7576
document_url = 'http://' + url + href
7677
print(Fore.GREEN + "Found document: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{document_url}" + Style.RESET_ALL)
@@ -141,6 +142,9 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
141142
print(Fore.RED + "File extraction failed. Reason: {}".format(e) + Style.RESET_ALL)
142143
print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
143144
pass
145+
ps_emails_list = [x for x in total_emails if x]
146+
ps_emails_return = [', '.join(sublist) for sublist in ps_emails_list]
147+
#print(ps_emails_return)
144148
clean_bad_pdfs(ps_docs_path)
145149
if keywords_flag == 1:
146150
print(Fore.GREEN + "Starting keywords searching in PDF files" + Style.RESET_ALL)
@@ -153,3 +157,4 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
153157
elif keywords_flag == 0:
154158
print(Fore.RED + "Keywords gathering won't start because of None user input" + Style.RESET_ALL)
155159
print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
160+
return ps_emails_return

0 commit comments

Comments (0)