File tree Expand file tree Collapse file tree 4 files changed +26
-13
lines changed Expand file tree Collapse file tree 4 files changed +26
-13
lines changed Original file line number Diff line number Diff line change @@ -157,12 +157,16 @@ strip_document
157
157
within the document are unaffected.
158
158
Defaults to ``STRIP ``.
159
159
160
- beautiful_soup_parser
161
- Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
162
- as `html5lib `, `lxml ` or even a custom parser as long as it is installed on the execution
163
- environment. Defaults to ``html.parser ``.
164
-
165
- .. _BeautifulSoup : https://www.crummy.com/software/BeautifulSoup/
160
+ bs4_options
161
+ Specify additional configuration options for the ``BeautifulSoup `` object
162
+ used to interpret the HTML markup. String and list values (such as ``lxml ``
163
+ or ``html5lib ``) are treated as ``features `` arguments to control parser
164
+ selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"} ``)
165
+ are treated as full kwargs to be used for the BeautifulSoup constructor,
166
+ allowing specification of any parameter. For parameter details, see the
167
+ `BeautifulSoup`_ documentation.
168
+
169
+ .. _BeautifulSoup : https://www.crummy.com/software/BeautifulSoup/bs4/doc/
166
170
167
171
Options may be specified as kwargs to the ``markdownify `` function, or as a
168
172
nested ``Options `` class in ``MarkdownConverter `` subclasses.
Original file line number Diff line number Diff line change @@ -154,7 +154,7 @@ def _next_block_content_sibling(el):
154
154
class MarkdownConverter (object ):
155
155
class DefaultOptions :
156
156
autolinks = True
157
- beautiful_soup_parser = 'html.parser'
157
+ bs4_options = 'html.parser'
158
158
bullets = '*+-' # An iterable of bullet types.
159
159
code_language = ''
160
160
code_language_callback = None
@@ -188,11 +188,15 @@ def __init__(self, **options):
188
188
raise ValueError ('You may specify either tags to strip or tags to'
189
189
' convert, but not both.' )
190
190
191
+ # If a string or list is passed to bs4_options, assume it is a 'features' specification
192
+ if not isinstance (self .options ['bs4_options' ], dict ):
193
+ self .options ['bs4_options' ] = {'features' : self .options ['bs4_options' ]}
194
+
191
195
# Initialize the conversion function cache
192
196
self .convert_fn_cache = {}
193
197
194
198
def convert (self , html ):
195
- soup = BeautifulSoup (html , self .options ['beautiful_soup_parser ' ])
199
+ soup = BeautifulSoup (html , ** self .options ['bs4_options' ])
196
200
return self .convert_soup (soup )
197
201
198
202
def convert_soup (self , soup ):
Original file line number Diff line number Diff line change @@ -70,12 +70,11 @@ def main(argv=sys.argv[1:]):
70
70
parser .add_argument ('-w' , '--wrap' , action = 'store_true' ,
71
71
help = "Wrap all text paragraphs at --wrap-width characters." )
72
72
parser .add_argument ('--wrap-width' , type = int , default = 80 )
73
- parser .add_argument ('-p' , '--beautiful-soup-parser' ,
74
- dest = 'beautiful_soup_parser' ,
73
+ parser .add_argument ('--bs4-options' ,
75
74
default = 'html.parser' ,
76
- help = "Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
77
- "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
78
- "environment ." )
75
+ help = "Specifies the parser that BeautifulSoup should use to parse "
76
+ "the HTML markup. Examples include 'html.parser', 'lxml', and "
77
+ "'html5lib' ." )
79
78
80
79
args = parser .parse_args (argv )
81
80
print (markdownify (** vars (args )))
Original file line number Diff line number Diff line change @@ -32,3 +32,9 @@ def test_strip_document():
32
32
assert markdownify ("<p>Hello</p>" , strip_document = RSTRIP ) == "\n \n Hello"
33
33
assert markdownify ("<p>Hello</p>" , strip_document = STRIP ) == "Hello"
34
34
assert markdownify ("<p>Hello</p>" , strip_document = None ) == "\n \n Hello\n \n "
35
+
36
+
37
+ def test_bs4_options ():
38
+ assert markdownify ("<p>Hello</p>" , bs4_options = "html.parser" ) == "Hello"
39
+ assert markdownify ("<p>Hello</p>" , bs4_options = ["html.parser" ]) == "Hello"
40
+ assert markdownify ("<p>Hello</p>" , bs4_options = {"features" : "html.parser" }) == "Hello"
You can’t perform that action at this time.
0 commit comments