Skip to content

remove superfluous leading/trailing whitespace #181

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,13 @@ wrap, wrap_width
Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
A ``wrap_width`` value of ``None`` reflows lines to unlimited line length.

strip_document
Controls whether leading and/or trailing separation newlines are removed from
the final converted document. Supported values are ``LSTRIP`` (leading),
``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (no removal). Newlines
within the document are unaffected.
Defaults to ``LSTRIP``.

Options may be specified as kwargs to the ``markdownify`` function, or as a
nested ``Options`` class in ``MarkdownConverter`` subclasses.

Expand Down
26 changes: 22 additions & 4 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@
ASTERISK = '*'
UNDERSCORE = '_'

# Document strip styles: accepted values for the ``strip_document`` option
LSTRIP = 'lstrip'  # remove leading separation newlines only
RSTRIP = 'rstrip'  # remove trailing separation newlines only
STRIP = 'strip'  # remove both leading and trailing separation newlines


def chomp(text):
"""
Expand Down Expand Up @@ -99,6 +104,7 @@ class DefaultOptions:
keep_inline_images_in = []
newline_style = SPACES
strip = None
strip_document = LSTRIP
strong_em_symbol = ASTERISK
sub_symbol = ''
sup_symbol = ''
Expand Down Expand Up @@ -180,7 +186,18 @@ def process_tag(self, node, convert_as_inline):
return text

def convert__document_(self, el, text, convert_as_inline):
    """Final document-level formatting for BeautifulSoup object (node.name == "[document]").

    Removes leading and/or trailing separation newlines from the converted
    document according to the ``strip_document`` option. Only newline
    characters at the very start or end of ``text`` are affected.

    ``el`` and ``convert_as_inline`` are accepted for signature parity with
    the other ``convert_*`` methods and are not used here.

    Returns the (possibly stripped) document text.

    Raises ValueError if ``strip_document`` is not one of ``LSTRIP``,
    ``RSTRIP``, ``STRIP`` or ``None``.
    """
    strip_document = self.options['strip_document']

    if strip_document == LSTRIP:
        return text.lstrip('\n')  # remove leading separation newlines
    if strip_document == RSTRIP:
        return text.rstrip('\n')  # remove trailing separation newlines
    if strip_document == STRIP:
        return text.strip('\n')  # remove leading and trailing separation newlines
    if strip_document is None:
        return text  # leave leading and trailing separation newlines as-is

    # Wrap the value in a 1-tuple: a bare tuple on the right of ``%`` would be
    # unpacked as format arguments and raise TypeError instead of ValueError.
    raise ValueError('Invalid value for strip_document: %s' % (strip_document,))

def process_text(self, el):
Expand Down Expand Up @@ -454,6 +471,7 @@ def _indent_for_li(match):
def convert_p(self, el, text, convert_as_inline):
if convert_as_inline:
return ' ' + text.strip() + ' '
text = text.strip()
if self.options['wrap']:
# Preserve newlines (and preceding whitespace) resulting
# from <br> tags. Newlines in the input have already been
Expand Down Expand Up @@ -500,13 +518,13 @@ def convert_style(self, el, text, convert_as_inline):
convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

def convert_table(self, el, text, convert_as_inline):
return '\n\n' + text + '\n'
return '\n\n' + text.strip() + '\n\n'

def convert_caption(self, el, text, convert_as_inline):
return text + '\n\n'
return text.strip() + '\n\n'

def convert_figcaption(self, el, text, convert_as_inline):
return '\n\n' + text + '\n\n'
return '\n\n' + text.strip() + '\n\n'

def convert_td(self, el, text, convert_as_inline):
colspan = 1
Expand Down
9 changes: 8 additions & 1 deletion tests/test_advanced.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from markdownify import markdownify as md
from markdownify import MarkdownConverter


def md(html, **options):
    """Convert *html* to Markdown with document-level stripping disabled.

    ``strip_document`` is always forced to ``None`` (replacing any value the
    caller supplies) so that separation newlines are included in testing.
    """
    converter = MarkdownConverter(**{**options, "strip_document": None})
    return converter.convert(html)


def test_chomp():
Expand Down
10 changes: 9 additions & 1 deletion tests/test_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Test whitelisting/blacklisting of specific tags.

"""
from markdownify import markdownify as md
from markdownify import markdownify as md, LSTRIP, RSTRIP, STRIP


def test_strip():
Expand All @@ -23,3 +23,11 @@ def test_convert():
def test_do_not_convert():
    """With an empty ``convert`` list the anchor is reduced to its text."""
    html = '<a href="https://github.com/matthewwithanm">Some Text</a>'
    assert md(html, convert=[]) == 'Some Text'


def test_strip_document():
    """Check each ``strip_document`` setting against a single paragraph."""
    html = "<p>Hello</p>"

    # With no explicit option the result matches LSTRIP.
    assert md(html) == "Hello\n\n"

    expected_by_style = [
        (LSTRIP, "Hello\n\n"),
        (RSTRIP, "\n\nHello"),
        (STRIP, "Hello"),
        (None, "\n\nHello\n\n"),
    ]
    for style, expected in expected_by_style:
        assert md(html, strip_document=style) == expected
9 changes: 8 additions & 1 deletion tests/test_basic.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from markdownify import markdownify as md
from markdownify import MarkdownConverter


def md(html, **options):
    """Convert *html* to Markdown with document-level stripping disabled.

    ``strip_document`` is always forced to ``None`` (replacing any value the
    caller supplies) so that separation newlines are included in testing.
    """
    converter = MarkdownConverter(**{**options, "strip_document": None})
    return converter.convert(html)


def test_single_tag():
Expand Down
20 changes: 14 additions & 6 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
from markdownify import MarkdownConverter, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE


def md(html, **options):
    """Convert *html* to Markdown with document-level stripping disabled.

    ``strip_document`` is always forced to ``None`` (replacing any value the
    caller supplies) so that separation newlines are included in testing.
    """
    converter = MarkdownConverter(**{**options, "strip_document": None})
    return converter.convert(html)


def inline_tests(tag, markup):
Expand Down Expand Up @@ -79,11 +86,6 @@ def test_br():
assert md('a<br />b<br />c', newline_style=BACKSLASH) == 'a\\\nb\\\nc'


def test_caption():
assert md('TEXT<figure><figcaption>Caption</figcaption><span>SPAN</span></figure>') == 'TEXT\n\nCaption\n\nSPAN'
assert md('<figure><span>SPAN</span><figcaption>Caption</figcaption></figure>TEXT') == 'SPAN\n\nCaption\n\nTEXT'


def test_code():
inline_tests('code', '`')
assert md('<code>*this_should_not_escape*</code>') == '`*this_should_not_escape*`'
Expand Down Expand Up @@ -126,6 +128,11 @@ def test_em():
inline_tests('em', '*')


def test_figcaption():
    """A figcaption is set off by blank lines and its own padding is trimmed."""
    caption_first = md("TEXT<figure><figcaption>\nCaption\n</figcaption><span>SPAN</span></figure>")
    assert caption_first == "TEXT\n\nCaption\n\nSPAN"

    caption_last = md("<figure><span>SPAN</span><figcaption>\nCaption\n</figcaption></figure>TEXT")
    assert caption_last == "SPAN\n\nCaption\n\nTEXT"


def test_header_with_space():
assert md('<h3>\n\nHello</h3>') == '\n\n### Hello\n\n'
assert md('<h3>Hello\n\n\nWorld</h3>') == '\n\n### Hello World\n\n'
Expand Down Expand Up @@ -236,6 +243,7 @@ def test_kbd():

def test_p():
assert md('<p>hello</p>') == '\n\nhello\n\n'
assert md("<p><p>hello</p></p>") == "\n\nhello\n\n"
assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'
Expand Down
9 changes: 8 additions & 1 deletion tests/test_escaping.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import warnings
from bs4 import MarkupResemblesLocatorWarning
from markdownify import markdownify as md
from markdownify import MarkdownConverter


def md(html, **options):
    """Convert *html* to Markdown with document-level stripping disabled.

    ``strip_document`` is always forced to ``None`` (replacing any value the
    caller supplies) so that separation newlines are included in testing.
    """
    converter = MarkdownConverter(**{**options, "strip_document": None})
    return converter.convert(html)


def test_asterisks():
Expand Down
9 changes: 8 additions & 1 deletion tests/test_lists.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from markdownify import markdownify as md
from markdownify import MarkdownConverter


def md(html, **options):
    """Convert *html* to Markdown with document-level stripping disabled.

    ``strip_document`` is always forced to ``None`` (replacing any value the
    caller supplies) so that separation newlines are included in testing.
    """
    converter = MarkdownConverter(**{**options, "strip_document": None})
    return converter.convert(html)


nested_uls = """
Expand Down
14 changes: 12 additions & 2 deletions tests/test_tables.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from markdownify import markdownify as md
from markdownify import MarkdownConverter


def md(html, **options):
    """Convert *html* to Markdown with document-level stripping disabled.

    ``strip_document`` is always forced to ``None`` (replacing any value the
    caller supplies) so that separation newlines are included in testing.
    """
    converter = MarkdownConverter(**{**options, "strip_document": None})
    return converter.convert(html)


table = """<table>
Expand Down Expand Up @@ -228,7 +235,10 @@
</tbody>
</table>"""

table_with_caption = """TEXT<table><caption>Caption</caption>
table_with_caption = """TEXT<table>
<caption>
Caption
</caption>
<tbody><tr><td>Firstname</td>
<td>Lastname</td>
<td>Age</td>
Expand Down
Loading