Skip to content

Commit 77a4978

Browse files
Created pagesearch_parsers.py [added basic PageSearch functionality]
1 parent a6cb100 commit 77a4978

File tree

1 file changed

+90
-0
lines changed

1 file changed

+90
-0
lines changed

pagesearch/pagesearch_parsers.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import re
4+
from colorama import Fore, Style
5+
import os
6+
# Lower-case file extensions that are treated as downloadable documents.
_DOCUMENT_EXTENSIONS = (
    '.docx', '.xlsx', '.csv', '.pdf', '.pptx', '.doc', '.ppt', '.xls', '.rtf',
)


def _matched_document_extension(href):
    """Return the known document extension that *href* ends with, or None."""
    lowered = href.lower()
    for extension in _DOCUMENT_EXTENSIONS:
        if lowered.endswith(extension):
            return extension
    return None


def _save_document(content, href, extension, ps_docs_path, counter):
    """Write *content* into *ps_docs_path* and return the linked file's name.

    The file is stored as ``extracted_<counter>_<stem><extension>``, where
    ``<stem>`` is the base name of *href* without its extension.
    """
    filename = os.path.basename(href)
    stem = os.path.splitext(filename)[0]
    extracted_path = os.path.join(ps_docs_path, f"extracted_{counter}_{stem}{extension}")
    with open(extracted_path, 'wb') as file:
        file.write(content)
    return filename


def subdomains_parser(subdomains_list, report_folder, timeout=10):
    """Scan each subdomain's start page and download the documents it links to.

    For every host in *subdomains_list* the page at ``http://<host>`` is
    fetched; its title and any e-mail addresses found in the page text are
    printed, and every ``<a href>`` ending in a known document extension is
    downloaded into ``<report_folder>/ps_documents``.

    Args:
        subdomains_list: Iterable of host names (without a scheme).
        report_folder: Directory under which ``ps_documents`` is created.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Results are printed and documents are written to disk.

    Errors while handling one page or one document are printed and the scan
    continues with the next item; nothing is raised to the caller.
    """
    # Local import keeps this block self-contained; urljoin resolves absolute,
    # root-relative and path-relative hrefs against the page URL.
    from urllib.parse import urljoin

    ps_docs_path = os.path.join(report_folder, 'ps_documents')
    os.makedirs(ps_docs_path, exist_ok=True)
    email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    # Running number that keeps saved file names unique across all pages.
    counter = 1
    for url in subdomains_list:
        try:
            page_response = requests.get('http://' + url, timeout=timeout)
            soup = BeautifulSoup(page_response.content, 'html.parser')
            # A page without <title> must not abort the scan of that page.
            title = soup.title.string if soup.title else None
            emails = email_pattern.findall(soup.text)
            print(Fore.GREEN + f"Page URL: {url}" + Style.RESET_ALL)
            print(Fore.GREEN + f"Page title: {title}" + Style.RESET_ALL)
            print(Fore.GREEN + f"Founded e-mails: {', '.join(emails)}" + Style.RESET_ALL)
            for link in soup.find_all('a'):
                href = link.get('href')
                if not href:
                    continue
                extension = _matched_document_extension(href)
                if extension is None:
                    continue
                # Resolve against the final page URL (after any redirects).
                document_url = urljoin(page_response.url, href)
                print(Fore.GREEN + f"Found document: {document_url}" + Style.RESET_ALL)
                try:
                    document_response = requests.get(document_url, timeout=timeout)
                    if document_response.status_code != 200:
                        print(Fore.RED + "Error" + Style.RESET_ALL)
                        continue
                    filename = _save_document(
                        document_response.content, href, extension, ps_docs_path, counter
                    )
                except (requests.RequestException, OSError) as e:
                    # One broken download must not stop the remaining links.
                    print(Fore.RED + "Error {}".format(e) + Style.RESET_ALL)
                    continue
                print(Fore.GREEN + f"File {filename} was successfully saved" + Style.RESET_ALL)
                counter += 1
            print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
        except Exception as e:
            # Best-effort boundary: report the failure and move to the next host.
            print(Fore.RED + "Error {}".format(e) + Style.RESET_ALL)

0 commit comments

Comments
 (0)