diff --git a/README.rst b/README.rst index 34ed7e0..b37a503 100644 --- a/README.rst +++ b/README.rst @@ -150,6 +150,13 @@ wrap, wrap_width Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs. A `wrap_width` value of `None` reflows lines to unlimited line length. +strip_document + Controls whether leading and/or trailing separation newlines are removed from + the final converted document. Supported values are ``LSTRIP`` (leading), + ``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (neither). Newlines + within the document are unaffected. + Defaults to ``STRIP``. + Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. diff --git a/markdownify/__init__.py b/markdownify/__init__.py index ef4e7ca..7d14fe7 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -26,6 +26,11 @@ ASTERISK = '*' UNDERSCORE = '_' +# Document strip styles +LSTRIP = 'lstrip' +RSTRIP = 'rstrip' +STRIP = 'strip' + def chomp(text): """ @@ -99,6 +104,7 @@ class DefaultOptions: keep_inline_images_in = [] newline_style = SPACES strip = None + strip_document = STRIP strong_em_symbol = ASTERISK sub_symbol = '' sup_symbol = '' @@ -180,7 +186,18 @@ def process_tag(self, node, convert_as_inline): return text def convert__document_(self, el, text, convert_as_inline): - # for BeautifulSoup objects (where node.name == "[document]"), return content results as-is + """Final document-level formatting for BeautifulSoup object (node.name == "[document]")""" + if self.options['strip_document'] == LSTRIP: + text = text.lstrip('\n') # remove leading separation newlines + elif self.options['strip_document'] == RSTRIP: + text = text.rstrip('\n') # remove trailing separation newlines + elif self.options['strip_document'] == STRIP: + text = text.strip('\n') # remove leading and trailing separation newlines + elif self.options['strip_document'] is None: + pass # leave leading and trailing separation 
newlines as-is + else: + raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document']) + return text def process_text(self, el): @@ -454,6 +471,7 @@ def _indent_for_li(match): def convert_p(self, el, text, convert_as_inline): if convert_as_inline: return ' ' + text.strip() + ' ' + text = text.strip() if self.options['wrap']: # Preserve newlines (and preceding whitespace) resulting # from
tags. Newlines in the input have already been @@ -500,13 +518,13 @@ def convert_style(self, el, text, convert_as_inline): convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol']) def convert_table(self, el, text, convert_as_inline): - return '\n\n' + text + '\n' + return '\n\n' + text.strip() + '\n\n' def convert_caption(self, el, text, convert_as_inline): - return text + '\n\n' + return text.strip() + '\n\n' def convert_figcaption(self, el, text, convert_as_inline): - return '\n\n' + text + '\n\n' + return '\n\n' + text.strip() + '\n\n' def convert_td(self, el, text, convert_as_inline): colspan = 1 diff --git a/tests/test_advanced.py b/tests/test_advanced.py index a3a5fda..6123d8c 100644 --- a/tests/test_advanced.py +++ b/tests/test_advanced.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md +from .utils import md def test_chomp(): diff --git a/tests/test_args.py b/tests/test_args.py index ebce4a8..301c19f 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -2,7 +2,8 @@ Test whitelisting/blacklisting of specific tags. """ -from markdownify import markdownify as md +from markdownify import markdownify, LSTRIP, RSTRIP, STRIP +from .utils import md def test_strip(): @@ -23,3 +24,11 @@ def test_convert(): def test_do_not_convert(): text = md('Some Text', convert=[]) assert text == 'Some Text' + + +def test_strip_document(): + assert markdownify("

<p>Hello</p>") == "Hello" # test default of STRIP + assert markdownify("<p>Hello</p>", strip_document=LSTRIP) == "Hello\n\n" + assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello" + assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello" + assert markdownify("<p>Hello</p>
", strip_document=None) == "\n\nHello\n\n" diff --git a/tests/test_basic.py b/tests/test_basic.py index 66f8b6c..584adb9 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md +from .utils import md def test_single_tag(): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 05c6cd4..1367006 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,4 +1,5 @@ -from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE +from markdownify import ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE +from .utils import md def inline_tests(tag, markup): @@ -79,11 +80,6 @@ def test_br(): assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' -def test_caption(): - assert md('TEXT
Caption
SPAN
') == 'TEXT\n\nCaption\n\nSPAN' - assert md('
SPAN
Caption
TEXT') == 'SPAN\n\nCaption\n\nTEXT' - - def test_code(): inline_tests('code', '`') assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' @@ -126,6 +122,11 @@ def test_em(): inline_tests('em', '*') +def test_figcaption(): + assert (md("TEXT
\nCaption\n
SPAN
") == "TEXT\n\nCaption\n\nSPAN") + assert (md("
SPAN
\nCaption\n
TEXT") == "SPAN\n\nCaption\n\nTEXT") + + def test_header_with_space(): assert md('

<h3>\n\nHello</h3>') == '\n\n### Hello\n\n' assert md('<h3>Hello\n\n\nWorld</h3>
') == '\n\n### Hello World\n\n' @@ -236,6 +237,7 @@ def test_kbd(): def test_p(): assert md('

<p>hello</p>
') == '\n\nhello\n\n' + assert md("

hello

") == "\n\nhello\n\n" assert md('

<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n' assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n' assert md('<p>123456789\n\n\n123456789</p>
', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' diff --git a/tests/test_custom_converter.py b/tests/test_custom_converter.py index adc83f7..0d3f6af 100644 --- a/tests/test_custom_converter.py +++ b/tests/test_custom_converter.py @@ -20,8 +20,8 @@ def test_custom_conversion_functions(): def md(html, **options): return UnitTestConverter(**options).convert(html) - assert md('Alt text') == '![Alt text](/path/to/img.jpg "Optional title")\n\n' - assert md('Alt text') == '![Alt text](/path/to/img.jpg)\n\n' + assert md('Alt texttext') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext' + assert md('Alt texttext') == '![Alt text](/path/to/img.jpg)\n\ntext' assert md("text") == "FUNCTION USED: text" diff --git a/tests/test_escaping.py b/tests/test_escaping.py index 878760a..d213675 100644 --- a/tests/test_escaping.py +++ b/tests/test_escaping.py @@ -1,6 +1,6 @@ import warnings from bs4 import MarkupResemblesLocatorWarning -from markdownify import markdownify as md +from .utils import md def test_asterisks(): diff --git a/tests/test_lists.py b/tests/test_lists.py index ce54a87..6b320ca 100644 --- a/tests/test_lists.py +++ b/tests/test_lists.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md +from .utils import md nested_uls = """ diff --git a/tests/test_tables.py b/tests/test_tables.py index da4bf53..e41b389 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md +from .utils import md table = """ @@ -228,7 +228,10 @@
""" -table_with_caption = """TEXT +table_with_caption = """TEXT
Caption
+ diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..0dac580 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,9 @@ +from markdownify import MarkdownConverter + + +# for unit testing, disable document-level stripping by default so that +# separation newlines are included in testing +def md(html, **options): + options = {"strip_document": None, **options} + + return MarkdownConverter(**options).convert(html)
+ Caption +
Firstname Lastname Age