|
| 1 | +# mypy: disable-error-code="no-untyped-call,no-untyped-def" |
| 2 | +import argparse |
| 3 | +import atexit |
| 4 | +import base64 |
| 5 | +import os.path |
| 6 | +import sys |
| 7 | +import tempfile |
| 8 | +from pathlib import Path |
| 9 | +from shutil import copy |
| 10 | +from typing import Optional, List |
| 11 | + |
| 12 | +import requests |
| 13 | +from requests import Response |
| 14 | +from selenium import webdriver |
| 15 | +from selenium.webdriver.chrome.options import Options |
| 16 | +from selenium.webdriver.chrome.service import Service |
| 17 | +from webdriver_manager.chrome import ChromeDriverManager |
| 18 | +from webdriver_manager.core.download_manager import WDMDownloadManager |
| 19 | +from webdriver_manager.core.driver import Driver |
| 20 | +from webdriver_manager.core.driver_cache import DriverCacheManager |
| 21 | +from webdriver_manager.core.file_manager import FileManager |
| 22 | +from webdriver_manager.core.http import HttpClient |
| 23 | +from webdriver_manager.core.os_manager import OperationSystemManager |
| 24 | + |
__version__ = "0.0.1"

# HTML2PDF.js prints Unicode symbols to the browser console, and this script
# echoes the browser console log to stdout. On Windows, print() otherwise
# fails with:
# UnicodeEncodeError: 'charmap' codec can't encode characters in position 129-130: character maps to <undefined>
# Re-wrapping stdout's file descriptor with an explicit UTF-8 encoding avoids
# that. closefd=False leaves the underlying descriptor open when the wrapper
# object is closed.
# See "How to make python 3 print() utf8":
# https://stackoverflow.com/questions/3597480/how-to-make-python-3-print-utf8
sys.stdout = open(sys.stdout.fileno(), mode="w", encoding="utf8", closefd=False)
| 33 | + |
| 34 | + |
class HTML2PDF_HTTPClient(HttpClient):
    """
    HTTP client for webdriver_manager that issues GET requests through
    ``requests`` with a short timeout and one retry, logging every attempt
    to stdout.
    """

    def get(self, url, params=None, **kwargs) -> Response:
        """
        Send a GET request to ``url``, retrying once if the request times out.

        A (connect, read) timeout of 5 seconds each is applied unless the
        caller provides its own ``timeout`` keyword argument.

        Raises:
            requests.exceptions.Timeout: every attempt timed out. The last
                timeout error is attached as the cause.
            AssertionError: the request failed for any other reason. The
                original exception is attached as the cause.
        """
        # setdefault() rather than a hard-coded keyword: passing timeout= to
        # requests.get() twice would raise a TypeError.
        kwargs.setdefault("timeout", (5, 5))

        last_error: Optional[Exception] = None
        for attempt in range(1, 3):
            print(  # noqa: T201
                f"HTML2PDF_HTTPClient: sending GET request attempt {attempt}: {url}"
            )
            try:
                return requests.get(url, params, **kwargs)
            except (
                requests.exceptions.ConnectTimeout,
                requests.exceptions.ReadTimeout,
            ) as timeout_error_:
                # Timeouts are the only failures worth retrying.
                last_error = timeout_error_
            except Exception as exception_:
                raise AssertionError(
                    "HTML2PDF_HTTPClient: unknown exception", exception_
                ) from exception_

        print(  # noqa: T201
            f"HTML2PDF_HTTPClient: "
            f"failed to get response for URL: {url} with error: {last_error}"
        )
        # The method promises a Response. Returning None here would only
        # postpone the failure to the first attribute access in the caller.
        raise requests.exceptions.Timeout(
            f"HTML2PDF_HTTPClient: no response for URL: {url}"
        ) from last_error
| 59 | + |
| 60 | + |
class HTML2PDF_CacheManager(DriverCacheManager):
    """
    Driver cache manager that keeps a copy of chromedriver in a StrictDoc-owned
    cache directory, laid out as:

        <path_to_cache_dir>/chromedriver/<browser version>/<OS type>/chromedriver
    """

    def __init__(self, file_manager: FileManager, path_to_cache_dir: str):
        super().__init__(file_manager=file_manager)
        # Root directory of StrictDoc's own chromedriver cache.
        self.path_to_cache_dir: str = path_to_cache_dir

    def find_driver(self, driver: Driver) -> Optional[str]:
        """
        Return the path to a chromedriver binary for ``driver``, or None.

        Lookup order:
        1. StrictDoc's local cache: the cached binary is returned if present.
        2. The parent cache manager: a driver found there is copied into
           StrictDoc's local cache and the path of the copy is returned.

        None is returned when the parent cache manager has no driver either.
        """
        os_type = self.get_os_type()
        browser_type = driver.get_browser_type()
        browser_version = self._os_system_manager.get_browser_version_from_os(
            browser_type
        )
        if not browser_version:
            # Without a version there is no cache sub-directory to look into,
            # and os.path.join() would raise a TypeError on None. Leave the
            # decision to the parent implementation.
            print(  # noqa: T201
                "HTML2PDF_CacheManager: could not detect the browser version, "
                "skipping StrictDoc's local cache."
            )
            return super().find_driver(driver)

        path_to_cached_chrome_driver_dir = os.path.join(
            self.path_to_cache_dir, "chromedriver", browser_version, os_type
        )
        path_to_cached_chrome_driver = os.path.join(
            path_to_cached_chrome_driver_dir, "chromedriver"
        )
        if os.path.isfile(path_to_cached_chrome_driver):
            print(  # noqa: T201
                f"HTML2PDF_CacheManager: chromedriver exists in StrictDoc's local cache: "
                f"{path_to_cached_chrome_driver}"
            )
            return path_to_cached_chrome_driver
        print(  # noqa: T201
            f"HTML2PDF_CacheManager: chromedriver does not exist in StrictDoc's local cache: "
            f"{path_to_cached_chrome_driver}"
        )

        path_to_downloaded_chrome_driver = super().find_driver(driver)
        if path_to_downloaded_chrome_driver is None:
            print(  # noqa: T201
                f"HTML2PDF_CacheManager: could not get a downloaded Chrome driver: "
                f"{path_to_cached_chrome_driver}"
            )
            return None

        print(  # noqa: T201
            f"HTML2PDF_CacheManager: saving chromedriver to StrictDoc's local cache: "
            f"{path_to_downloaded_chrome_driver} -> {path_to_cached_chrome_driver}"
        )
        Path(path_to_cached_chrome_driver_dir).mkdir(
            parents=True, exist_ok=True
        )
        copy(path_to_downloaded_chrome_driver, path_to_cached_chrome_driver)

        return path_to_cached_chrome_driver
| 111 | + |
| 112 | + |
def get_inches_from_millimeters(mm: float) -> float:
    """Convert a length given in millimeters to inches."""
    millimeters_per_inch = 25.4
    return mm / millimeters_per_inch
| 115 | + |
| 116 | + |
def get_pdf_from_html(driver, url) -> bytes:
    """
    Open ``url`` with the given Chrome driver and return the page as PDF bytes.

    The page is printed with the Chrome DevTools ``Page.printToPDF`` command
    using a 210 x 297 mm paper size. The browser console log entries of the
    session are echoed to stdout.
    """
    print(f"HTML2PDF: opening URL with Chrome Driver: {url}")  # noqa: T201
    driver.get(url)

    # All dimensions are specified in millimeters and converted below because
    # the DevTools protocol expects inches.
    paper_width_mm, paper_height_mm = 210, 297
    vertical_margin_mm, horizontal_margin_mm = 12, 21

    # https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-printToPDF
    print_options = dict(
        landscape=False,
        displayHeaderFooter=False,
        printBackground=True,
        # Experimental: generates a document outline (table of contents).
        generateDocumentOutline=True,
        # Prefer the page size defined by CSS. When false (the default), the
        # content is scaled to fit the paper size.
        preferCSSPageSize=True,
        # Paper size in inches. Protocol defaults: 8.5 x 11 inches.
        paperWidth=get_inches_from_millimeters(paper_width_mm),
        paperHeight=get_inches_from_millimeters(paper_height_mm),
        # WIP: Changing the margin settings has no effect.
        # Margins in inches. Protocol default for each: 1cm (~0.4 inches).
        marginTop=get_inches_from_millimeters(vertical_margin_mm),
        marginBottom=get_inches_from_millimeters(vertical_margin_mm),
        marginLeft=get_inches_from_millimeters(horizontal_margin_mm),
        marginRight=get_inches_from_millimeters(horizontal_margin_mm),
    )

    print("HTML2PDF: executing print command with Chrome Driver.")  # noqa: T201
    print_result = driver.execute_cdp_cmd("Page.printToPDF", print_options)

    # Echo everything the page wrote to the JS console, framed by triple
    # quotes so that the section is easy to spot in the output.
    log_frame = '"""'
    print("HTML2PDF: JS logs from the print session:")  # noqa: T201
    print(log_frame)  # noqa: T201
    for console_entry in driver.get_log("browser"):
        print(console_entry)  # noqa: T201
    print(log_frame)  # noqa: T201

    # The command returns the PDF document base64-encoded under "data".
    return base64.b64decode(print_result["data"])
| 159 | + |
| 160 | + |
def create_webdriver(chromedriver: Optional[str], path_to_cache_dir: str):
    """
    Create a headless Chrome WebDriver with JS console logging enabled.

    ``chromedriver`` is the path to an existing chromedriver binary. When it
    is None, the binary is obtained through ChromeDriverManager, using
    ``path_to_cache_dir`` as the root of the local driver cache.
    """
    print("HTML2PDF: creating Chrome Driver service.", flush=True)  # noqa: T201

    driver_binary_path = chromedriver
    if driver_binary_path is None:
        driver_cache = HTML2PDF_CacheManager(
            file_manager=FileManager(
                os_system_manager=OperationSystemManager()
            ),
            path_to_cache_dir=path_to_cache_dir,
        )
        downloader = WDMDownloadManager(HTML2PDF_HTTPClient())
        driver_manager = ChromeDriverManager(
            download_manager=downloader, cache_manager=driver_cache
        )
        driver_binary_path = driver_manager.install()
    print(f"HTML2PDF: Chrome Driver available at path: {driver_binary_path}")  # noqa: T201

    chrome_service = Service(driver_binary_path)

    chrome_options = Options()
    for argument in (
        "start-maximized",
        "disable-infobars",
        "--headless",
        "--disable-extensions",
    ):
        chrome_options.add_argument(argument)

    chrome_options.add_experimental_option("useAutomationExtension", False)
    chrome_options.add_experimental_option(
        "excludeSwitches", ["enable-automation"]
    )

    # Capture everything that the page writes to the JS console.
    chrome_options.set_capability("goog:loggingPrefs", {"browser": "ALL"})

    print("HTML2PDF: creating Chrome Driver.", flush=True)  # noqa: T201

    chrome_driver = webdriver.Chrome(
        options=chrome_options,
        service=chrome_service,
    )
    chrome_driver.set_page_load_timeout(60)
    return chrome_driver
| 205 | + |
| 206 | + |
def _split_path_pair(path_pair: str) -> List[str]:
    r"""
    Split an "INPUT_HTML:OUTPUT_PDF" argument into its two paths.

    A colon that belongs to a Windows drive prefix ("C:\" or "C:/") is not
    treated as the separator, so "C:\in.html:C:\out.pdf" splits into
    "C:\in.html" and "C:\out.pdf". Arguments that contain exactly one colon
    are split at that colon.

    Raises:
        ValueError: the argument does not consist of exactly two paths.
    """
    import re  # Local import: only this helper needs it.

    # One path: an optional drive prefix followed by colon-free characters.
    single_path = r"(?:[A-Za-z]:[\\/])?[^:]*"
    match = re.fullmatch(f"({single_path}):({single_path})", path_pair)
    if match is None:
        raise ValueError(
            f"HTML2PDF: expected a path pair INPUT_HTML:OUTPUT_PDF, "
            f"got: {path_pair}"
        )
    return [match.group(1), match.group(2)]


def main():
    """
    Command-line entry point: print each input HTML file to a PDF file.

    Every positional argument is an "INPUT_HTML:OUTPUT_PDF" pair. A single
    Chrome driver is created for all pairs and is quit at interpreter exit.

    Raises:
        ValueError: a positional argument is not a valid path pair.
        FileNotFoundError: an input HTML file does not exist.
    """
    # By default, all driver binaries are saved to user.home/.wdm folder.
    # You can override this setting and save binaries to project.root/.wdm.
    os.environ["WDM_LOCAL"] = "1"

    parser = argparse.ArgumentParser(description="HTML2PDF printer script.")
    parser.add_argument(
        "--chromedriver",
        type=str,
        help="Optional chromedriver path. Downloaded if not given.",
    )
    parser.add_argument(
        "--cache-dir",
        type=str,
        help="Optional path to a cache directory whereto the Chrome driver is downloaded.",
    )
    parser.add_argument(
        "paths",
        nargs="+",
        help="One or more path pairs in the form INPUT_HTML:OUTPUT_PDF.",
    )
    args = parser.parse_args()

    paths: List[str] = args.paths

    # Parse all pairs up front so that a malformed argument is reported
    # before the Chrome driver is downloaded and started.
    path_pairs = [_split_path_pair(path_pair_) for path_pair_ in paths]

    path_to_cache_dir: str = (
        args.cache_dir
        if args.cache_dir is not None
        else os.path.join(
            tempfile.gettempdir(), "strictdoc_cache", "chromedriver"
        )
    )
    driver = create_webdriver(args.chromedriver, path_to_cache_dir)

    @atexit.register
    def exit_handler():
        print("HTML2PDF: exit handler: quitting the Chrome Driver.")  # noqa: T201
        driver.quit()

    for path_to_input_html, path_to_output_pdf in path_pairs:
        # Explicit check instead of assert: asserts are stripped under -O.
        if not os.path.isfile(path_to_input_html):
            raise FileNotFoundError(
                f"HTML2PDF: input HTML file does not exist: {path_to_input_html}"
            )

        path_to_output_pdf_dir = os.path.dirname(path_to_output_pdf)
        Path(path_to_output_pdf_dir).mkdir(parents=True, exist_ok=True)

        # Chrome is given a file:// URL, which requires an absolute path.
        url = Path(os.path.abspath(path_to_input_html)).as_uri()

        pdf_bytes = get_pdf_from_html(driver, url)
        with open(path_to_output_pdf, "wb") as f:
            f.write(pdf_bytes)
| 256 | + |
| 257 | + |
# Run the printer only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
0 commit comments