@@ -44,7 +44,6 @@ def clean_bad_pdfs(ps_docs_path):
             bad_pdfs.append(pdf_file)
             pass
     if len(bad_pdfs) > 0:
-        print(Fore.GREEN + f"Found {len(bad_pdfs)} corrupted PDF files. Deleting..." + Style.RESET_ALL)
         for pdfs in bad_pdfs:
             os.remove(os.path.join(ps_docs_path, pdfs))
     else:
@@ -56,88 +55,102 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
     os.makedirs(ps_docs_path)
     email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
     total_emails = []
+    accessible_subdomains = 0
+    files_counter = 0
     for url in subdomains_list:
         try:
             response = requests.get('http://' + url)
-            soup = BeautifulSoup(response.content, 'html.parser')
-            title = soup.title.string
-            emails = re.findall(email_pattern, soup.text)
-            total_emails.append(emails)
-            if not emails:
-                emails = ['None']
-            print(Fore.GREEN + "Page URL: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{url}" + Style.RESET_ALL)
-            print(Fore.GREEN + "Page title: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{title}" + Style.RESET_ALL)
-            print(Fore.GREEN + "Founded e-mails: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{', '.join(emails)}" + Style.RESET_ALL)
-            links = soup.find_all('a')
-            for link in links:
-                href = link.get('href')
-                if href:
-                    if href.lower().endswith(('.docx', '.xlsx', '.csv', '.pdf', '.pptx', '.doc', '.ppt', '.xls', '.rtf')):
-                        document_url = 'http://' + url + href
-                        print(Fore.GREEN + "Found document: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{document_url}" + Style.RESET_ALL)
-                        response = requests.get(document_url)
-                        if response.status_code == 200:
-                            if href and href.lower().endswith(('.docx')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.docx")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.xlsx')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.xlsx")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.pdf')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.pdf")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.csv')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.csv")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.pptx')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.pptx")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.doc')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.doc")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.ppt')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.ppt")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.xls')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.xls")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.json')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.json")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-                            elif href and href.lower().endswith(('.txt')):
-                                filename = os.path.basename(href)
-                                extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.txt")
-                                with open(extracted_path, 'wb') as file:
-                                    file.write(response.content)
-                                print(Fore.GREEN + "File was successfully saved")
-            print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")
+            if response.status_code == 200:
+                accessible_subdomains += 1
+                soup = BeautifulSoup(response.content, 'html.parser')
+                title = soup.title.string
+                emails = re.findall(email_pattern, soup.text)
+                total_emails.append(emails)
+                if not emails:
+                    emails = ['None']
+                print(Fore.GREEN + "Page URL: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{url}" + Style.RESET_ALL)
+                print(Fore.GREEN + "Page title: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{title}" + Style.RESET_ALL)
+                print(Fore.GREEN + "Founded e-mails: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{', '.join(emails)}" + Style.RESET_ALL)
+                links = soup.find_all('a')
+                for link in links:
+                    href = link.get('href')
+                    if href:
+                        if href.lower().endswith(('.docx', '.xlsx', '.csv', '.pdf', '.pptx', '.doc', '.ppt', '.xls', '.rtf')):
+                            document_url = 'http://' + url + href
+                            print(Fore.GREEN + "Found document: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{document_url}" + Style.RESET_ALL)
+                            response = requests.get(document_url)
+                            if response.status_code == 200:
+                                if href and href.lower().endswith(('.docx')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.docx")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.xlsx')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.xlsx")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.pdf')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.pdf")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.csv')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.csv")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.pptx')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.pptx")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.doc')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.doc")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.ppt')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.ppt")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.xls')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.xls")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.json')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.json")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                                elif href and href.lower().endswith(('.txt')):
+                                    filename = os.path.basename(href)
+                                    extracted_path = os.path.join(ps_docs_path, f"extracted_{os.path.splitext(filename)[0]}.txt")
+                                    with open(extracted_path, 'wb') as file:
+                                        file.write(response.content)
+                                    files_counter += 1
+                                    print(Fore.GREEN + "File was successfully saved")
+                print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")
         except Exception as e:
             print(Fore.RED + "File extraction failed. Reason: {}".format(e) + Style.RESET_ALL)
             print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
@@ -147,7 +160,7 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
     clean_bad_pdfs(ps_docs_path)

     if keywords_flag == 1:
-        print(Fore.GREEN + "Starting keywords searching in PDF files" + Style.RESET_ALL)
+        print(Fore.GREEN + "Searching keywords in PDF files..." + Style.RESET_ALL)
         try:
             pdf_results = find_keywords_in_pdfs(ps_docs_path, keywords)
             for pdf_file, found_keywords in pdf_results.items():
@@ -157,4 +170,8 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
     elif keywords_flag == 0:
         print(Fore.RED + "Keywords gathering won't start because of None user input" + Style.RESET_ALL)
     print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
+    print(Fore.GREEN + f"\nDuring PageSearch process:\n[+] Total {len(subdomains_list)} subdomains were checked")
+    print(Fore.GREEN + f"[+] Among them, {accessible_subdomains} subdomains were accessible")
+    print(Fore.GREEN + f"[+] In result, {len(ps_emails_return)} unique e-mail addresses were found")
+    print(Fore.GREEN + f"[+] Also, {files_counter} files were extracted")
     return ps_emails_return
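One observation on the new extraction block: the ten `elif` branches differ only in the extension they re-attach to the saved filename, so the whole chain could be driven by a single data-driven helper. The sketch below is illustrative only and not part of this commit; `DOC_EXTENSIONS` and `save_document` are hypothetical names.

```python
import os

# All extensions handled by the commit's elif chain (hypothetical constant).
DOC_EXTENSIONS = ('.docx', '.xlsx', '.pdf', '.csv', '.pptx',
                  '.doc', '.ppt', '.xls', '.json', '.txt')

def save_document(href, content, ps_docs_path):
    """Hypothetical helper: save downloaded bytes under an 'extracted_'
    prefix, keeping the link's original extension."""
    base, ext = os.path.splitext(os.path.basename(href))
    extracted_path = os.path.join(ps_docs_path, f"extracted_{base}{ext.lower()}")
    with open(extracted_path, 'wb') as file:  # binary write, as in the commit
        file.write(content)
    return extracted_path
```

With a helper like this, each branch would reduce to one `if href.lower().endswith(DOC_EXTENSIONS):` check followed by a single `save_document(href, response.content, ps_docs_path)` call and one `files_counter += 1`.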