diff --git a/README.rst b/README.rst index 71f6b14..946c83d 100644 --- a/README.rst +++ b/README.rst @@ -157,12 +157,16 @@ strip_document within the document are unaffected. Defaults to ``STRIP``. -beautiful_soup_parser - Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such - as `html5lib`, `lxml` or even a custom parser as long as it is installed on the execution - environment. Defaults to ``html.parser``. - -.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/ +bs4_options + Specify additional configuration options for the ``BeautifulSoup`` object + used to interpret the HTML markup. String and list values (such as ``lxml`` + or ``html5lib``) are treated as ``features`` arguments to control parser + selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``) + are treated as full kwargs to be used for the BeautifulSoup constructor, + allowing specification of any parameter. For parameter details, see the + `BeautifulSoup`_ documentation. Defaults to ``html.parser``. + +.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/ Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. diff --git a/markdownify/__init__.py b/markdownify/__init__.py index c732711..b219ca2 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -154,7 +154,7 @@ def _next_block_content_sibling(el): class MarkdownConverter(object): class DefaultOptions: autolinks = True - beautiful_soup_parser = 'html.parser' + bs4_options = 'html.parser' bullets = '*+-' # An iterable of bullet types. 
code_language = '' code_language_callback = None @@ -188,11 +188,15 @@ def __init__(self, **options): raise ValueError('You may specify either tags to strip or tags to' ' convert, but not both.') + # If a string or list is passed to bs4_options, assume it is a 'features' specification + if not isinstance(self.options['bs4_options'], dict): + self.options['bs4_options'] = {'features': self.options['bs4_options']} + # Initialize the conversion function cache self.convert_fn_cache = {} def convert(self, html): - soup = BeautifulSoup(html, self.options['beautiful_soup_parser']) + soup = BeautifulSoup(html, **self.options['bs4_options']) return self.convert_soup(soup) def convert_soup(self, soup): diff --git a/markdownify/main.py b/markdownify/main.py old mode 100644 new mode 100755 index 2fc02f8..ba70671 --- a/markdownify/main.py +++ b/markdownify/main.py @@ -70,12 +70,11 @@ def main(argv=sys.argv[1:]): parser.add_argument('-w', '--wrap', action='store_true', help="Wrap all text paragraphs at --wrap-width characters.") parser.add_argument('--wrap-width', type=int, default=80) - parser.add_argument('-p', '--beautiful-soup-parser', - dest='beautiful_soup_parser', + parser.add_argument('--bs4-options', default='html.parser', - help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such " - "as html5lib, lxml or even a custom parser as long as it is installed on the execution " - "environment.") + help="Specifies the parser that BeautifulSoup should use to parse " + "the HTML markup. Examples include 'html.parser', 'lxml', and " + "'html5lib'.") args = parser.parse_args(argv) print(markdownify(**vars(args))) diff --git a/tests/test_args.py b/tests/test_args.py index 301c19f..1ba6482 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -32,3 +32,9 @@ def test_strip_document(): assert markdownify("

Hello

", strip_document=RSTRIP) == "\n\nHello" assert markdownify("

Hello

", strip_document=STRIP) == "Hello" assert markdownify("

Hello

", strip_document=None) == "\n\nHello\n\n" + + +def test_bs4_options(): + assert markdownify("

Hello

", bs4_options="html.parser") == "Hello" + assert markdownify("

Hello

", bs4_options=["html.parser"]) == "Hello" + assert markdownify("

Hello

", bs4_options={"features": "html.parser"}) == "Hello"