diff --git a/README.rst b/README.rst
index 34ed7e0..b37a503 100644
--- a/README.rst
+++ b/README.rst
@@ -150,6 +150,13 @@ wrap, wrap_width
Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
A `wrap_width` value of `None` reflows lines to unlimited line length.
+strip_document
+ Controls whether leading and/or trailing separation newlines are removed from
+ the final converted document. Supported values are ``LSTRIP`` (leading),
+ ``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (neither). Newlines
+ within the document are unaffected.
+ Defaults to ``STRIP``.
+
Options may be specified as kwargs to the ``markdownify`` function, or as a
nested ``Options`` class in ``MarkdownConverter`` subclasses.
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index ef4e7ca..7d14fe7 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -26,6 +26,11 @@
ASTERISK = '*'
UNDERSCORE = '_'
+# Document strip styles: accepted values of the strip_document option (None disables stripping)
+LSTRIP = 'lstrip'
+RSTRIP = 'rstrip'
+STRIP = 'strip'
+
def chomp(text):
"""
@@ -99,6 +104,7 @@ class DefaultOptions:
keep_inline_images_in = []
newline_style = SPACES
strip = None
+ strip_document = STRIP
strong_em_symbol = ASTERISK
sub_symbol = ''
sup_symbol = ''
@@ -180,7 +186,18 @@ def process_tag(self, node, convert_as_inline):
return text
def convert__document_(self, el, text, convert_as_inline):
- # for BeautifulSoup objects (where node.name == "[document]"), return content results as-is
+ """Apply the strip_document option to the converted text of a BeautifulSoup object (node.name == "[document]") and return it; raises ValueError for an unrecognized option value."""
+ if self.options['strip_document'] == LSTRIP:
+ text = text.lstrip('\n') # remove leading separation newlines only ('\n' characters; other whitespace is kept)
+ elif self.options['strip_document'] == RSTRIP:
+ text = text.rstrip('\n') # remove trailing separation newlines only ('\n' characters; other whitespace is kept)
+ elif self.options['strip_document'] == STRIP:
+ text = text.strip('\n') # remove both leading and trailing separation newlines
+ elif self.options['strip_document'] is None:
+ pass # None is the explicit opt-out: leave leading and trailing separation newlines as-is
+ else:
+ raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])
+
return text
def process_text(self, el):
@@ -454,6 +471,7 @@ def _indent_for_li(match):
def convert_p(self, el, text, convert_as_inline):
if convert_as_inline:
return ' ' + text.strip() + ' '
+ text = text.strip()
if self.options['wrap']:
# Preserve newlines (and preceding whitespace) resulting
# from <br> tags. Newlines in the input have already been
@@ -500,13 +518,13 @@ def convert_style(self, el, text, convert_as_inline):
convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
def convert_table(self, el, text, convert_as_inline):
- return '\n\n' + text + '\n'
+ return '\n\n' + text.strip() + '\n\n'  # strip the table text, then pad it with two newlines before and after
def convert_caption(self, el, text, convert_as_inline):
- return text + '\n\n'
+ return text.strip() + '\n\n'  # strip the caption text, then follow it with two newlines
def convert_figcaption(self, el, text, convert_as_inline):
- return '\n\n' + text + '\n\n'
+ return '\n\n' + text.strip() + '\n\n'  # strip the figcaption text, then pad it with two newlines before and after
def convert_td(self, el, text, convert_as_inline):
colspan = 1
diff --git a/tests/test_advanced.py b/tests/test_advanced.py
index a3a5fda..6123d8c 100644
--- a/tests/test_advanced.py
+++ b/tests/test_advanced.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md
def test_chomp():
diff --git a/tests/test_args.py b/tests/test_args.py
index ebce4a8..301c19f 100644
--- a/tests/test_args.py
+++ b/tests/test_args.py
@@ -2,7 +2,8 @@
Test whitelisting/blacklisting of specific tags.
"""
-from markdownify import markdownify as md
+from markdownify import markdownify, LSTRIP, RSTRIP, STRIP
+from .utils import md
def test_strip():
@@ -23,3 +24,11 @@ def test_convert():
def test_do_not_convert():
text = md('Some Text', convert=[])
assert text == 'Some Text'
+
+
+def test_strip_document():
+ assert markdownify("
Hello
") == "Hello" # test default of STRIP + assert markdownify("Hello
", strip_document=LSTRIP) == "Hello\n\n" + assert markdownify("Hello
", strip_document=RSTRIP) == "\n\nHello" + assert markdownify("Hello
", strip_document=STRIP) == "Hello" + assert markdownify("Hello
", strip_document=None) == "\n\nHello\n\n" diff --git a/tests/test_basic.py b/tests/test_basic.py index 66f8b6c..584adb9 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md +from .utils import md def test_single_tag(): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 05c6cd4..1367006 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,4 +1,5 @@ -from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE +from markdownify import ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE +from .utils import md def inline_tests(tag, markup): @@ -79,11 +80,6 @@ def test_br(): assert md('a*this_should_not_escape*
') == '`*this_should_not_escape*`'
@@ -126,6 +122,11 @@ def test_em():
inline_tests('em', '*')
+def test_figcaption():
+ assert (md("TEXThello
') == '\n\nhello\n\n' + assert md("hello
") == "\n\nhello\n\n" assert md('123456789 123456789
') == '\n\n123456789 123456789\n\n' assert md('123456789\n\n\n123456789
') == '\n\n123456789\n123456789\n\n' assert md('123456789\n\n\n123456789
', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' diff --git a/tests/test_custom_converter.py b/tests/test_custom_converter.py index adc83f7..0d3f6af 100644 --- a/tests/test_custom_converter.py +++ b/tests/test_custom_converter.py @@ -20,8 +20,8 @@ def test_custom_conversion_functions(): def md(html, **options): return UnitTestConverter(**options).convert(html) - assert md('Firstname | Lastname | Age | diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..0dac580 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,9 @@ +from markdownify import MarkdownConverter + + +# for unit testing, disable document-level stripping by default so that +# separation newlines are included in testing +def md(html, **options): + options = {"strip_document": None, **options} + + return MarkdownConverter(**options).convert(html)