diff --git a/README.rst b/README.rst index c6c6d84..71f6b14 100644 --- a/README.rst +++ b/README.rst @@ -157,6 +157,13 @@ strip_document within the document are unaffected. Defaults to ``STRIP``. +beautiful_soup_parser + Specify the Beautiful Soup parser to be used for interpreting HTML markup. Any parser, such + as ``html5lib``, ``lxml`` or even a custom parser, may be used as long as it is installed in the execution + environment. Defaults to ``html.parser``. + +.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/ + Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 7f69bfe..25196e6 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -154,6 +154,7 @@ def _next_block_content_sibling(el): class MarkdownConverter(object): class DefaultOptions: autolinks = True + beautiful_soup_parser = 'html.parser' bullets = '*+-' # An iterable of bullet types. code_language = '' code_language_callback = None @@ -191,7 +192,7 @@ def __init__(self, **options): self.convert_fn_cache = {} def convert(self, html): - soup = BeautifulSoup(html, 'html.parser') + soup = BeautifulSoup(html, self.options['beautiful_soup_parser']) return self.convert_soup(soup) def convert_soup(self, soup): diff --git a/markdownify/main.py b/markdownify/main.py index 432efb5..2fc02f8 100644 --- a/markdownify/main.py +++ b/markdownify/main.py @@ -55,7 +55,9 @@ def main(argv=sys.argv[1:]): parser.add_argument('--no-escape-underscores', dest='escape_underscores', action='store_false', help="Do not escape '_' to '\\_' in text.") - parser.add_argument('-i', '--keep-inline-images-in', nargs='*', + parser.add_argument('-i', '--keep-inline-images-in', + default=[], + nargs='*', help="Images are converted to their alt-text when the images are " "located inside headlines or table cells. 
If some inline images " "should be converted to markdown images instead, this option can " @@ -68,6 +70,12 @@ def main(argv=sys.argv[1:]): parser.add_argument('-w', '--wrap', action='store_true', help="Wrap all text paragraphs at --wrap-width characters.") parser.add_argument('--wrap-width', type=int, default=80) + parser.add_argument('-p', '--beautiful-soup-parser', + dest='beautiful_soup_parser', + default='html.parser', + help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such " + "as html5lib, lxml or even a custom parser as long as it is installed on the execution " + "environment.") args = parser.parse_args(argv) print(markdownify(**vars(args)))