Add beautiful_soup_parser option (#206)

vincentkelleher · AlexVonB · web-flow · commit 2d654a6b7e82 · 2025-03-29T11:29:29.000+01:00
* add beautiful_soup_parser option
* add Beautiful Soup parser argument to command line

---------

Co-authored-by: Vincent Kelleher <vincent.kelleher-ext@francetravail.fr>
Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>
diff --git a/README.rst b/README.rst
@@ -157,6 +157,13 @@ strip_document
   within the document are unaffected.
   Defaults to ``STRIP``.
 
+beautiful_soup_parser
+  Specify the Beautiful Soup parser to be used for interpreting HTML markup. Any parser
+  installed in the execution environment may be used, such as ``html5lib``, ``lxml`` or a
+  custom parser. Defaults to ``html.parser``.
+
+.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
+
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.
 
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -154,6 +154,7 @@ def _next_block_content_sibling(el):
 class MarkdownConverter(object):
     class DefaultOptions:
         autolinks = True
+        beautiful_soup_parser = 'html.parser'
         bullets = '*+-'  # An iterable of bullet types.
         code_language = ''
         code_language_callback = None
@@ -191,7 +192,7 @@ def __init__(self, **options):
         self.convert_fn_cache = {}
 
     def convert(self, html):
-        soup = BeautifulSoup(html, 'html.parser')
+        soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
         return self.convert_soup(soup)
 
     def convert_soup(self, soup):
diff --git a/markdownify/main.py b/markdownify/main.py
@@ -55,7 +55,9 @@ def main(argv=sys.argv[1:]):
     parser.add_argument('--no-escape-underscores', dest='escape_underscores',
                         action='store_false',
                         help="Do not escape '_' to '\\_' in text.")
-    parser.add_argument('-i', '--keep-inline-images-in', nargs='*',
+    parser.add_argument('-i', '--keep-inline-images-in',
+                        default=[],
+                        nargs='*',
                         help="Images are converted to their alt-text when the images are "
                         "located inside headlines or table cells. If some inline images "
                         "should be converted to markdown images instead, this option can "
@@ -68,6 +70,12 @@ def main(argv=sys.argv[1:]):
     parser.add_argument('-w', '--wrap', action='store_true',
                         help="Wrap all text paragraphs at --wrap-width characters.")
     parser.add_argument('--wrap-width', type=int, default=80)
+    parser.add_argument('-p', '--beautiful-soup-parser',
+                        dest='beautiful_soup_parser',
+                        default='html.parser',
+                        help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
+                             "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
+                             "environment.")
 
     args = parser.parse_args(argv)
     print(markdownify(**vars(args)))