allow BeautifulSoup configuration kwargs to be specified (#224)

chrispy-snps · web-flow · commit 75ab3064dd8f · 2025-06-14T09:06:22.000-04:00
Signed-off-by: chrispy <chrispy@synopsys.com>
diff --git a/README.rst b/README.rst
@@ -157,12 +157,16 @@ strip_document
   within the document are unaffected.
   Defaults to ``STRIP``.
 
-beautiful_soup_parser
-  Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
-  as `html5lib`, `lxml` or even a custom parser as long as it is installed on the execution
-  environment. Defaults to ``html.parser``.
-
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
+bs4_options
+  Specify additional configuration options for the ``BeautifulSoup`` object
+  used to interpret the HTML markup. String and list values (such as ``lxml``
+  or ``html5lib``) are treated as ``features`` arguments to control parser
+  selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
+  are treated as full kwargs to be used for the BeautifulSoup constructor,
+  allowing specification of any parameter. For parameter details, see the
+  BeautifulSoup_ documentation.
+
+.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
 
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -154,7 +154,7 @@ def _next_block_content_sibling(el):
 class MarkdownConverter(object):
     class DefaultOptions:
         autolinks = True
-        beautiful_soup_parser = 'html.parser'
+        bs4_options = 'html.parser'
         bullets = '*+-'  # An iterable of bullet types.
         code_language = ''
         code_language_callback = None
@@ -188,11 +188,15 @@ def __init__(self, **options):
             raise ValueError('You may specify either tags to strip or tags to'
                              ' convert, but not both.')
 
+        # If a string or list is passed to bs4_options, assume it is a 'features' specification
+        if not isinstance(self.options['bs4_options'], dict):
+            self.options['bs4_options'] = {'features': self.options['bs4_options']}
+
         # Initialize the conversion function cache
         self.convert_fn_cache = {}
 
     def convert(self, html):
-        soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
+        soup = BeautifulSoup(html, **self.options['bs4_options'])
         return self.convert_soup(soup)
 
     def convert_soup(self, soup):
diff --git a/markdownify/main.py b/markdownify/main.py
@@ -70,12 +70,11 @@ def main(argv=sys.argv[1:]):
     parser.add_argument('-w', '--wrap', action='store_true',
                         help="Wrap all text paragraphs at --wrap-width characters.")
     parser.add_argument('--wrap-width', type=int, default=80)
-    parser.add_argument('-p', '--beautiful-soup-parser',
-                        dest='beautiful_soup_parser',
+    parser.add_argument('--bs4-options',
                         default='html.parser',
-                        help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
-                             "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
-                             "environment.")
+                        help="Specifies the parser that BeautifulSoup should use to parse "
+                             "the HTML markup. Examples include 'html.parser', 'lxml', and "
+                             "'html5lib'.")
 
     args = parser.parse_args(argv)
     print(markdownify(**vars(args)))
diff --git a/tests/test_args.py b/tests/test_args.py
@@ -32,3 +32,9 @@ def test_strip_document():
     assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
     assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
     assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
+
+
+def test_bs4_options():  # the "test_" prefix is required for pytest to collect this; str, list and dict forms must all work
+    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
+    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
+    assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"