Skip to content

Commit c7abb31

Browse files
Added PDF keywords search function
1 parent 3b3b833 commit c7abb31

File tree

1 file changed

+38
-1
lines changed

1 file changed

+38
-1
lines changed

pagesearch/pagesearch_parsers.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,37 @@
33
import re
44
from colorama import Fore, Style
55
import os
6-
def subdomains_parser(subdomains_list, report_folder):
6+
import fitz
7+
8+
def extract_text_from_pdf(filename: str) -> str:
    """Return the concatenated text of every page of a PDF file.

    The file is opened with PyMuPDF (``fitz``) and the ``get_text()`` output
    of each page is joined in page order.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The extracted text, or an empty string when the file cannot be
        opened or parsed (the reason is printed in red).
    """
    try:
        # The context manager closes the document even if a page fails to
        # parse, so no file handle is leaked.
        with fitz.open(filename=filename) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(Fore.RED + f"Can't open some PDF file. Reason: {e}" + Style.RESET_ALL)
        # Honour the declared ``str`` return type so callers can safely call
        # string methods on the result.
        return ""
18+
19+
def find_keywords_in_pdfs(ps_docs_path, keywords: list) -> dict:
    """Search every PDF in a folder for the given keywords.

    Matching is a case-insensitive substring test against the text returned
    by ``extract_text_from_pdf`` for each file whose name ends in ``.pdf``.

    Args:
        ps_docs_path: Directory that contains the PDF files to scan.
        keywords: Keywords to look for.

    Returns:
        A dict mapping each PDF file name to the list of keywords found in
        it, in the order they appear in ``keywords``. Files without matches
        are omitted. If an error interrupts the scan, the reason is printed
        in red and the matches gathered so far are returned.
    """
    results = {}
    try:
        # Lower-case each keyword once instead of once per file.
        lowered_keywords = [(keyword, keyword.lower()) for keyword in keywords or []]
        pdf_files = [f for f in os.listdir(ps_docs_path) if f.lower().endswith(".pdf")]
        for pdf_file in pdf_files:
            pdf_path = os.path.join(ps_docs_path, pdf_file)
            # A falsy result (unreadable PDF) is treated as empty text so that
            # one bad file does not abort the scan of the remaining files.
            extracted_text = (extract_text_from_pdf(pdf_path) or "").lower()
            found = [keyword for keyword, lowered in lowered_keywords if lowered in extracted_text]
            if found:
                results[pdf_file] = found
    except Exception as e:
        print(Fore.RED + f"Can't find keywords. Reason: {e}" + Style.RESET_ALL)
    # Always return a dict so callers can iterate ``.items()`` unconditionally.
    return results
35+
36+
def subdomains_parser(subdomains_list, report_folder, keywords):
737
ps_docs_path = report_folder + '//ps_documents'
838
if not os.path.exists(ps_docs_path):
939
os.makedirs(ps_docs_path)
@@ -88,3 +118,10 @@ def subdomains_parser(subdomains_list, report_folder):
88118
print(Fore.RED + "File extraction failed. Reason: {}".format(e) + Style.RESET_ALL)
89119
print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")
90120
pass
121+
try:
122+
pdf_results = find_keywords_in_pdfs(ps_docs_path, keywords)
123+
for pdf_file, found_keywords in pdf_results.items():
124+
print(Fore.GREEN + f"Keywords " + Fore.RESET + f"{', '.join(found_keywords)}" + Fore.GREEN + f" found in '{pdf_file}'")
125+
except Exception as e:
126+
print(Fore.RED + f"Can't find keywords. Reason: {e}")
127+
print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")

0 commit comments

Comments
 (0)