From ef1750819eb17fd30ef3b40f1adf54fef7e1aeca Mon Sep 17 00:00:00 2001 From: Vincent Kelleher Date: Mon, 17 Mar 2025 15:48:04 +0100 Subject: [PATCH 1/4] add beautiful_soup_parser option --- README.rst | 5 +++++ markdownify/__init__.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index c6c6d84..744afbb 100644 --- a/README.rst +++ b/README.rst @@ -73,6 +73,11 @@ heading_style ``ATX_CLOSED``, ``SETEXT``, and ``UNDERLINED`` (which is an alias for ``SETEXT``). Defaults to ``UNDERLINED``. +beautiful_soup_parser + Specifies the HTML parser to be used by [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/). + Values such as ``html5lib``, ``lxml`` can be used but require those parser to be installed in your project. + Defaults to ``html.parser``. + bullets An iterable (string, list, or tuple) of bullet styles to be used. If the iterable only contains one item, it will be used regardless of how deeply diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 7f69bfe..25196e6 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -154,6 +154,7 @@ def _next_block_content_sibling(el): class MarkdownConverter(object): class DefaultOptions: autolinks = True + beautiful_soup_parser = 'html.parser' bullets = '*+-' # An iterable of bullet types. 
code_language = '' code_language_callback = None @@ -191,7 +192,7 @@ def __init__(self, **options): self.convert_fn_cache = {} def convert(self, html): - soup = BeautifulSoup(html, 'html.parser') + soup = BeautifulSoup(html, self.options['beautiful_soup_parser']) return self.convert_soup(soup) def convert_soup(self, soup): From 2613deeea501bf4b5ee49f96b559217a798f0351 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Fri, 21 Mar 2025 16:35:40 +0100 Subject: [PATCH 2/4] Readme: break lines --- README.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 744afbb..b16c5a4 100644 --- a/README.rst +++ b/README.rst @@ -74,9 +74,10 @@ heading_style ``SETEXT``). Defaults to ``UNDERLINED``. beautiful_soup_parser - Specifies the HTML parser to be used by [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/). - Values such as ``html5lib``, ``lxml`` can be used but require those parser to be installed in your project. - Defaults to ``html.parser``. + Specifies the HTML parser to be used by + [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/). + Values such as ``html5lib``, ``lxml`` can be used but require those parser + to be installed in your project. Defaults to ``html.parser``. bullets An iterable (string, list, or tuple) of bullet styles to be used. If the From 8710308894a2dc6548bfbfcc7d0e4096b8998bf7 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Fri, 21 Mar 2025 16:41:37 +0100 Subject: [PATCH 3/4] Update README.rst --- README.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index b16c5a4..7c518ab 100644 --- a/README.rst +++ b/README.rst @@ -73,12 +73,6 @@ heading_style ``ATX_CLOSED``, ``SETEXT``, and ``UNDERLINED`` (which is an alias for ``SETEXT``). Defaults to ``UNDERLINED``. -beautiful_soup_parser - Specifies the HTML parser to be used by - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/). 
- Values such as ``html5lib``, ``lxml`` can be used but require those parser - to be installed in your project. Defaults to ``html.parser``. - bullets An iterable (string, list, or tuple) of bullet styles to be used. If the iterable only contains one item, it will be used regardless of how deeply @@ -163,6 +157,13 @@ strip_document within the document are unaffected. Defaults to ``STRIP``. +beautiful_soup_parser + Specifies the HTML parser to be used by `BeautifulSoup`_. + Values such as ``html5lib``, ``lxml`` can be used but require those parser + to be installed in your project. Defaults to ``html.parser``. + +.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/ + Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. From f916d95073f236857a42d0391d022b03ec88620e Mon Sep 17 00:00:00 2001 From: Vincent Kelleher Date: Mon, 24 Mar 2025 14:04:20 +0100 Subject: [PATCH 4/4] add Beautiful Soup parser argument to command line --- README.rst | 6 +++--- markdownify/main.py | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 7c518ab..71f6b14 100644 --- a/README.rst +++ b/README.rst @@ -158,9 +158,9 @@ strip_document Defaults to ``STRIP``. beautiful_soup_parser - Specifies the HTML parser to be used by `BeautifulSoup`_. - Values such as ``html5lib``, ``lxml`` can be used but require those parser - to be installed in your project. Defaults to ``html.parser``. + Specify the `BeautifulSoup`_ parser to be used for interpreting HTML markup. Parsers such + as ``html5lib``, ``lxml`` or even a custom parser can be used as long as they are installed + in the execution environment. Defaults to ``html.parser``. ..
_BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/ diff --git a/markdownify/main.py b/markdownify/main.py index 432efb5..2fc02f8 100644 --- a/markdownify/main.py +++ b/markdownify/main.py @@ -55,7 +55,9 @@ def main(argv=sys.argv[1:]): parser.add_argument('--no-escape-underscores', dest='escape_underscores', action='store_false', help="Do not escape '_' to '\\_' in text.") - parser.add_argument('-i', '--keep-inline-images-in', nargs='*', + parser.add_argument('-i', '--keep-inline-images-in', + default=[], + nargs='*', help="Images are converted to their alt-text when the images are " "located inside headlines or table cells. If some inline images " "should be converted to markdown images instead, this option can " @@ -68,6 +70,12 @@ def main(argv=sys.argv[1:]): parser.add_argument('-w', '--wrap', action='store_true', help="Wrap all text paragraphs at --wrap-width characters.") parser.add_argument('--wrap-width', type=int, default=80) + parser.add_argument('-p', '--beautiful-soup-parser', + dest='beautiful_soup_parser', + default='html.parser', + help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such " + "as html5lib, lxml or even a custom parser as long as it is installed on the execution " + "environment.") args = parser.parse_args(argv) print(markdownify(**vars(args)))