Commit 025e58d

Added post-scan summary and some exception handlers
1 parent 74227a4 commit 025e58d

File tree

1 file changed (+98 −81 lines)


pagesearch/pagesearch_parsers.py

Lines changed: 98 additions & 81 deletions
@@ -44,7 +44,6 @@ def clean_bad_pdfs(ps_docs_path):
             bad_pdfs.append(pdf_file)
             pass
     if len(bad_pdfs) > 0:
-        print(Fore.GREEN + f"Found {len(bad_pdfs)} corrupted PDF files. Deleting..." + Style.RESET_ALL)
         for pdfs in bad_pdfs:
             os.remove(os.path.join(ps_docs_path, pdfs))
     else:
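
Note: the bad_pdfs list is populated outside this hunk. A minimal sketch of what that detection loop plausibly looks like, assuming PyPDF2 is the parser (hypothetical, not part of this commit):

import os
from PyPDF2 import PdfReader
from PyPDF2.errors import PdfReadError

def find_bad_pdfs(ps_docs_path):
    # Hypothetical helper: a PDF that PdfReader cannot parse is treated as corrupted.
    bad_pdfs = []
    for pdf_file in os.listdir(ps_docs_path):
        if pdf_file.lower().endswith('.pdf'):
            try:
                PdfReader(os.path.join(ps_docs_path, pdf_file))
            except PdfReadError:
                bad_pdfs.append(pdf_file)
    return bad_pdfs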
@@ -56,88 +55,102 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
         os.makedirs(ps_docs_path)
     email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
     total_emails = []
+    accessible_subdomains = 0
+    files_counter = 0
     for url in subdomains_list:
         try:
             response = requests.get('http://' + url)
-            soup = BeautifulSoup(response.content, 'html.parser')
-            title = soup.title.string
-            emails = re.findall(email_pattern, soup.text)
-            total_emails.append(emails)
-            if not emails:
-                emails = ['None']
-            print(Fore.GREEN + "Page URL: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{url}" + Style.RESET_ALL)
-            print(Fore.GREEN + "Page title: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{title}" + Style.RESET_ALL)
-            print(Fore.GREEN + "Founded e-mails: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{', '.join(emails)}" + Style.RESET_ALL)
-            links = soup.find_all('a')
-            for link in links:
-                href = link.get('href')
-                if href:
-                    if href.lower().endswith(('.docx', '.xlsx', '.csv', '.pdf', '.pptx', '.doc', '.ppt', '.xls', '.rtf')):
-                        document_url = 'http://' + url + href
-                        print(Fore.GREEN + "Found document: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{document_url}" + Style.RESET_ALL)
-                        response = requests.get(document_url)
-                        if response.status_code == 200:
-                            if href and href.lower().endswith(('.docx')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.docx")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.xlsx')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.xlsx")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.pdf')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.pdf")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.csv')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.csv")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.pptx')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.pptx")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.doc')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.doc")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.ppt')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.ppt")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.xls')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.xls")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.json')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.json")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.txt')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.txt")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-            print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")
+            if response.status_code == 200:
+                accessible_subdomains += 1
+                soup = BeautifulSoup(response.content, 'html.parser')
+                title = soup.title.string
+                emails = re.findall(email_pattern, soup.text)
+                total_emails.append(emails)
+                if not emails:
+                    emails = ['None']
+                print(Fore.GREEN + "Page URL: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{url}" + Style.RESET_ALL)
+                print(Fore.GREEN + "Page title: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{title}" + Style.RESET_ALL)
+                print(Fore.GREEN + "Founded e-mails: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{', '.join(emails)}" + Style.RESET_ALL)
+                links = soup.find_all('a')
+                for link in links:
+                    href = link.get('href')
+                    if href:
+                        if href.lower().endswith(('.docx', '.xlsx', '.csv', '.pdf', '.pptx', '.doc', '.ppt', '.xls', '.rtf')):
+                            document_url = 'http://' + url + href
+                            print(Fore.GREEN + "Found document: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{document_url}" + Style.RESET_ALL)
+                            response = requests.get(document_url)
+                            if response.status_code == 200:
+                                if href and href.lower().endswith(('.docx')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.docx")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.xlsx')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.xlsx")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.pdf')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.pdf")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.csv')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.csv")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.pptx')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.pptx")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.doc')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.doc")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.ppt')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.ppt")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.xls')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.xls")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.json')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.json")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.txt')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.txt")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")
         except Exception as e:
             print(Fore.RED + "File extraction failed. Reason: {}".format(e) + Style.RESET_ALL)
             print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
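
Note: the ten save branches in this hunk differ only in the extension string. As an aside (a sketch, not part of this commit), the same behavior could be written data-driven, with names taken from the surrounding code:

# Equivalent to the if/elif chain above; one branch per extension collapses into a lookup.
SAVEABLE_EXTENSIONS = ('.docx', '.xlsx', '.pdf', '.csv', '.pptx', '.doc', '.ppt', '.xls', '.json', '.txt')

ext = os.path.splitext(href.lower())[1]
if ext in SAVEABLE_EXTENSIONS:
    filename = os.path.basename(href)
    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}{ext}")
    with open(extracted_path, 'wb') as file:
        file.write(response.content)
    files_counter += 1
    print(Fore.GREEN + "File was successfully saved")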
@@ -147,7 +160,7 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
     clean_bad_pdfs(ps_docs_path)

     if keywords_flag == 1:
-        print(Fore.GREEN + "Starting keywords searching in PDF files" + Style.RESET_ALL)
+        print(Fore.GREEN + "Searching keywords in PDF files..." + Style.RESET_ALL)
         try:
             pdf_results = find_keywords_in_pdfs(ps_docs_path, keywords)
             for pdf_file, found_keywords in pdf_results.items():
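
Note: find_keywords_in_pdfs is defined elsewhere in the module. A plausible shape, shown only so the loop above reads standalone (a sketch assuming pypdf-style text extraction, not the project's actual implementation):

import os
from pypdf import PdfReader

def find_keywords_in_pdfs(ps_docs_path, keywords):
    # Hypothetical: maps each PDF filename to the keywords found in its extracted text.
    results = {}
    for pdf_file in os.listdir(ps_docs_path):
        if pdf_file.lower().endswith('.pdf'):
            reader = PdfReader(os.path.join(ps_docs_path, pdf_file))
            text = " ".join((page.extract_text() or "") for page in reader.pages)
            found = [kw for kw in keywords if kw.lower() in text.lower()]
            if found:
                results[pdf_file] = found
    return results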
@@ -157,4 +170,8 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
     elif keywords_flag == 0:
         print(Fore.RED + "Keywords gathering won't start because of None user input" + Style.RESET_ALL)
     print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
+    print(Fore.GREEN + f"\nDuring PageSearch process:\n[+] Total {len(subdomains_list)} subdomains were checked")
+    print(Fore.GREEN + f"[+] Among them, {accessible_subdomains} subdomains were accessible")
+    print(Fore.GREEN + f"[+] In result, {len(ps_emails_return)} unique e-mail addresses were found")
+    print(Fore.GREEN + f"[+] Also, {files_counter} files were extracted")
     return ps_emails_return
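
Note: the new summary reports len(ps_emails_return) as a count of unique addresses; ps_emails_return is built outside this diff, presumably by flattening and de-duplicating total_emails, which holds one list of matches per page. A sketch of that assumption:

# Assumed construction of ps_emails_return -- not shown in this diff.
ps_emails_return = list({email for page_emails in total_emails for email in page_emails})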
