|
3 | 3 | import re
|
4 | 4 | from colorama import Fore, Style
|
5 | 5 | import os
|
6 |
| -def subdomains_parser(subdomains_list, report_folder): |
| 6 | +import fitz |
| 7 | + |
def extract_text_from_pdf(filename: str) -> str:
    """Return the concatenated text of every page in the PDF at *filename*.

    Extraction is best-effort: if the file cannot be opened or parsed, the
    reason is printed in red and an empty string is returned, so callers can
    always treat the result as ``str``.

    Args:
        filename: Path to the PDF file to read.

    Returns:
        The text of all pages joined in page order, or ``""`` on failure.
    """
    try:
        # The context manager closes the document (and its file handle) even
        # when a page raises part-way through extraction.
        with fitz.open(filename=filename) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberately broad: a single malformed PDF must not stop the scan.
        print(Fore.RED + f"Can't open some PDF file. Reason: {e}" + Style.RESET_ALL)
        return ""
| 18 | + |
def find_keywords_in_pdfs(ps_docs_path, keywords: list) -> dict:
    """Search every PDF in a directory for the given keywords.

    The match is a case-insensitive substring test against the text extracted
    from each file. Only files whose name ends with ``.pdf`` (in any letter
    case) are examined, and only files with at least one hit appear in the
    result.

    Args:
        ps_docs_path: Directory containing the downloaded documents.
        keywords: Keywords to look for.

    Returns:
        A dict mapping each matching PDF file name to the list of keywords
        found in it, in the order they appear in *keywords*. If the scan fails
        part-way (for example, the directory cannot be listed), the reason is
        printed in red and whatever was collected so far is returned; the
        result is never ``None``.
    """
    results = {}
    try:
        pdf_files = [f for f in os.listdir(ps_docs_path) if f.lower().endswith(".pdf")]
        for pdf_file in pdf_files:
            pdf_path = os.path.join(ps_docs_path, pdf_file)
            # The extractor may yield nothing for an unreadable file; treat
            # that as empty text so one bad PDF does not abort the whole scan.
            # Lower-case once per file instead of once per keyword.
            haystack = (extract_text_from_pdf(pdf_path) or "").lower()
            matched = [keyword for keyword in keywords if keyword.lower() in haystack]
            if matched:
                results[pdf_file] = matched
    except Exception as e:
        # Reset the style so the red colour does not leak into later output.
        print(Fore.RED + f"Can't find keywords. Reason: {e}" + Style.RESET_ALL)
    return results
| 35 | + |
| 36 | +def subdomains_parser(subdomains_list, report_folder, keywords): |
7 | 37 | ps_docs_path = report_folder + '//ps_documents'
|
8 | 38 | if not os.path.exists(ps_docs_path):
|
9 | 39 | os.makedirs(ps_docs_path)
|
@@ -88,3 +118,10 @@ def subdomains_parser(subdomains_list, report_folder):
|
88 | 118 | print(Fore.RED + "File extraction failed. Reason: {}".format(e) + Style.RESET_ALL)
|
89 | 119 | print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")
|
90 | 120 | pass
|
| 121 | + try: |
| 122 | + pdf_results = find_keywords_in_pdfs(ps_docs_path, keywords) |
| 123 | + for pdf_file, found_keywords in pdf_results.items(): |
| 124 | + print(Fore.GREEN + f"Keywords " + Fore.RESET + f"{', '.join(found_keywords)}" + Fore.GREEN + f" found in '{pdf_file}'") |
| 125 | + except Exception as e: |
| 126 | + print(Fore.RED + f"Can't find keywords. Reason: {e}") |
| 127 | + print(Fore.LIGHTGREEN_EX + "-------------------------------------------------") |
0 commit comments