Skip to content

Commit 2d654a6

Browse files
Add beautiful_soup_parser option (#206)
* add beautiful_soup_parser option * add Beautiful Soup parser argument to command line --------- Co-authored-by: Vincent Kelleher <vincent.kelleher-ext@francetravail.fr> Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>
1 parent 13183f9 commit 2d654a6

File tree

3 files changed

+18
-2
lines changed

3 files changed

+18
-2
lines changed

README.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,13 @@ strip_document
157157
within the document are unaffected.
158158
Defaults to ``STRIP``.
159159

160+
beautiful_soup_parser
161+
Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
162+
as ``html5lib``, ``lxml`` or even a custom parser can be used, as long as they are installed in the execution
163+
environment. Defaults to ``html.parser``.
164+
165+
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
166+
160167
Options may be specified as kwargs to the ``markdownify`` function, or as a
161168
nested ``Options`` class in ``MarkdownConverter`` subclasses.
162169

markdownify/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def _next_block_content_sibling(el):
154154
class MarkdownConverter(object):
155155
class DefaultOptions:
156156
autolinks = True
157+
beautiful_soup_parser = 'html.parser'
157158
bullets = '*+-' # An iterable of bullet types.
158159
code_language = ''
159160
code_language_callback = None
@@ -191,7 +192,7 @@ def __init__(self, **options):
191192
self.convert_fn_cache = {}
192193

193194
def convert(self, html):
194-
soup = BeautifulSoup(html, 'html.parser')
195+
soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
195196
return self.convert_soup(soup)
196197

197198
def convert_soup(self, soup):

markdownify/main.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,9 @@ def main(argv=sys.argv[1:]):
5555
parser.add_argument('--no-escape-underscores', dest='escape_underscores',
5656
action='store_false',
5757
help="Do not escape '_' to '\\_' in text.")
58-
parser.add_argument('-i', '--keep-inline-images-in', nargs='*',
58+
parser.add_argument('-i', '--keep-inline-images-in',
59+
default=[],
60+
nargs='*',
5961
help="Images are converted to their alt-text when the images are "
6062
"located inside headlines or table cells. If some inline images "
6163
"should be converted to markdown images instead, this option can "
@@ -68,6 +70,12 @@ def main(argv=sys.argv[1:]):
6870
parser.add_argument('-w', '--wrap', action='store_true',
6971
help="Wrap all text paragraphs at --wrap-width characters.")
7072
parser.add_argument('--wrap-width', type=int, default=80)
73+
parser.add_argument('-p', '--beautiful-soup-parser',
74+
dest='beautiful_soup_parser',
75+
default='html.parser',
76+
help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
77+
"as html5lib, lxml or even a custom parser as long as it is installed on the execution "
78+
"environment.")
7179

7280
args = parser.parse_args(argv)
7381
print(markdownify(**vars(args)))

0 commit comments

Comments
 (0)